blob: 8e1161e5387b44bdcfa879c983a73849c87514cc [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020044#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040047#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010048#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000049#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050050#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
59/* #define INTERNED_STATS 1 */
60
61
Larry Hastings61272b72014-01-07 12:41:53 -080062/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090063class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080064[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090065/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
66
67/*[python input]
68class Py_UCS4_converter(CConverter):
69 type = 'Py_UCS4'
70 converter = 'convert_uc'
71
72 def converter_init(self):
73 if self.default is not unspecified:
74 self.c_default = ascii(self.default)
75 if len(self.c_default) > 4 or self.c_default[0] != "'":
76 self.c_default = hex(ord(self.default))
77
78[python start generated code]*/
79/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000081/* --- Globals ------------------------------------------------------------
82
Serhiy Storchaka05997252013-01-26 12:14:02 +020083NOTE: In the interpreter's initialization phase, some globals are currently
84 initialized dynamically as needed. In the process Unicode objects may
85 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000086
87*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
90#ifdef __cplusplus
91extern "C" {
92#endif
93
/* Largest Unicode code point (Unicode 6.0): 0x10ffff (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Debug builds validate the object through the consistency checker
   (without scanning its content); other builds only test the type. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* Field accessors for unicode objects.

   The underscore-prefixed macros read the struct member directly.
   PyUnicode_UTF8() and PyUnicode_UTF8_LENGTH() assert that the object is
   a ready string and special-case compact ASCII objects: for those, the
   bytes stored right after the PyASCIIObject header and the `length`
   member are used instead of the utf8 / utf8_length members. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char *)((PyASCIIObject *)(op) + 1)) : \
         _PyUnicode_UTF8(op))
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject *)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* Direct member access, no checks. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Same members as above, but asserting that `op` is a unicode object. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Data pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200137
/* Local replacement for PyUnicode_READY that additionally asserts
   _PyUnicode_CHECK(op).  Evaluates to 0 when the string is already
   ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200144
/* True when the UTF-8 pointer of `op` is the very same buffer as its
   character data.  Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True when the wstr pointer of `op` is the very same buffer as its
   character data.  The macro only refers to its own parameter `op`; it
   previously expanded to _PyUnicode_WSTR(unicode) and therefore picked up
   whatever variable named `unicode` happened to be in the caller's scope
   (or failed to compile when there was none). */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object has its own UTF-8 memory block, i.e. the
   object is not compact ASCII, the utf8 pointer is set, and it does not
   alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has its own wstr memory block, i.e. the
   wstr pointer is set and either the string is not ready yet or wstr
   does not alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
166
/* Generic helper macro converting a run of characters from one integer
   width to another.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *".  to is a
   "to_type *" pointing at the buffer that receives the converted
   characters.  The bulk of the input is copied four characters per
   iteration; the remaining 0-3 characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_dst = (to_type *)(to); \
        const from_type *_src = (from_type *)(begin); \
        const from_type *_stop = (from_type *)(end); \
        Py_ssize_t _count = (_stop) - (_src); \
        const from_type *_unrolled_stop = \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4); \
        for (; _src < (_unrolled_stop); _src += 4, _dst += 4) { \
            _dst[0] = (to_type) _src[0]; \
            _dst[1] = (to_type) _src[1]; \
            _dst[2] = (to_type) _src[2]; \
            _dst[3] = (to_type) _src[3]; \
        } \
        for (; _src < (_stop); _src++, _dst++) { \
            *_dst = (to_type) *_src; \
        } \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Platform-specific over-allocation factor. */
#if defined(MS_WINDOWS)
   /* On Windows, overallocating by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
198
Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
/* Take a new reference to the shared empty string.  The string is created
   with PyUnicode_New(0, 0) on first use; if that call returns NULL,
   unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY() \
    do { \
        if (unicode_empty == NULL) { \
            unicode_empty = PyUnicode_New(0, 0); \
            if (unicode_empty != NULL) { \
                Py_INCREF(unicode_empty); \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            } \
        } \
        else { \
            Py_INCREF(unicode_empty); \
        } \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function (unicode_empty may still be NULL if its creation failed). */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { \
        _Py_INCREF_UNICODE_EMPTY(); \
        return unicode_empty; \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters: a table of
   128 flags indexed by code point (0x00..0x7F), 1 meaning whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Fast detection of line breaks: a table of 128 flags indexed by code
   point (0x00..0x7F), 1 meaning the character is a line break. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830/* --- Bloom Filters ----------------------------------------------------- */
831
832/* stuff to implement simple "bloom filters" for Unicode characters.
833 to keep things simple, we use a single bitmask, using the least 5
834 bits from each unicode characters as the bit index. */
835
836/* the linebreak mask is set up by Unicode_Init below */
837
Antoine Pitrouf068f942010-01-13 14:19:12 +0000838#if LONG_BIT >= 128
839#define BLOOM_WIDTH 128
840#elif LONG_BIT >= 64
841#define BLOOM_WIDTH 64
842#elif LONG_BIT >= 32
843#define BLOOM_WIDTH 32
844#else
845#error "LONG_BIT is smaller than 32"
846#endif
847
Thomas Wouters477c8d52006-05-27 19:21:47 +0000848#define BLOOM_MASK unsigned long
849
Serhiy Storchaka05997252013-01-26 12:14:02 +0200850static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000851
Antoine Pitrouf068f942010-01-13 14:19:12 +0000852#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853
Benjamin Peterson29060642009-01-31 22:14:21 +0000854#define BLOOM_LINEBREAK(ch) \
855 ((ch) < 128U ? ascii_linebreak[(ch)] : \
856 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
968 else
969 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
974 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
975 else
976 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
979 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
980 else
981 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize with 0xff bytes
   so that use of uninitialized characters is detected earlier.

   Only the range [old_length, current length) is filled; nothing is done
   when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);  /* bytes per character */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner84def372011-12-11 20:04:56 +01001040 _Py_DEC_REFTOTAL;
1041 _Py_ForgetReference(unicode);
1042
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001043 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001044 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001045 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001046 PyErr_NoMemory();
1047 return NULL;
1048 }
Victor Stinner84def372011-12-11 20:04:56 +01001049 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001051
Victor Stinnerfe226c02011-10-03 03:52:20 +02001052 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001053 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001055 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001056 _PyUnicode_WSTR_LENGTH(unicode) = length;
1057 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001058 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1059 PyObject_DEL(_PyUnicode_WSTR(unicode));
1060 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001061 if (!PyUnicode_IS_ASCII(unicode))
1062 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001063 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001064#ifdef Py_DEBUG
1065 unicode_fill_invalid(unicode, old_length);
1066#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001067 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1068 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001069 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 return unicode;
1071}
1072
Alexander Belopolsky40018472011-02-26 01:02:56 +00001073static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001074resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001075{
Victor Stinner95663112011-10-04 01:03:50 +02001076 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001077 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001078 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001080
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 if (PyUnicode_IS_READY(unicode)) {
1082 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001083 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001085#ifdef Py_DEBUG
1086 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1087#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088
1089 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001090 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001091 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1092 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001093
1094 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1095 PyErr_NoMemory();
1096 return -1;
1097 }
1098 new_size = (length + 1) * char_size;
1099
Victor Stinner7a9105a2011-12-12 00:13:42 +01001100 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1101 {
1102 PyObject_DEL(_PyUnicode_UTF8(unicode));
1103 _PyUnicode_UTF8(unicode) = NULL;
1104 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1105 }
1106
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107 data = (PyObject *)PyObject_REALLOC(data, new_size);
1108 if (data == NULL) {
1109 PyErr_NoMemory();
1110 return -1;
1111 }
1112 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001113 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001114 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001115 _PyUnicode_WSTR_LENGTH(unicode) = length;
1116 }
1117 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001118 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_UTF8_LENGTH(unicode) = length;
1120 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001121 _PyUnicode_LENGTH(unicode) = length;
1122 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001123#ifdef Py_DEBUG
1124 unicode_fill_invalid(unicode, old_length);
1125#endif
Victor Stinner95663112011-10-04 01:03:50 +02001126 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001127 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001130 }
Victor Stinner95663112011-10-04 01:03:50 +02001131 assert(_PyUnicode_WSTR(unicode) != NULL);
1132
1133 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001134 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001135 PyErr_NoMemory();
1136 return -1;
1137 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001138 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001139 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001140 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001141 if (!wstr) {
1142 PyErr_NoMemory();
1143 return -1;
1144 }
1145 _PyUnicode_WSTR(unicode) = wstr;
1146 _PyUnicode_WSTR(unicode)[length] = 0;
1147 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001148 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149 return 0;
1150}
1151
Victor Stinnerfe226c02011-10-03 03:52:20 +02001152static PyObject*
1153resize_copy(PyObject *unicode, Py_ssize_t length)
1154{
1155 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001156 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001157 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001158
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001159 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001160
1161 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1162 if (copy == NULL)
1163 return NULL;
1164
1165 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001166 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001167 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001168 }
1169 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001170 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001171
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001172 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001173 if (w == NULL)
1174 return NULL;
1175 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1176 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001177 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001178 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001179 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001180 }
1181}
1182
Guido van Rossumd57fd912000-03-10 22:53:23 +00001183/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001184 Ux0000 terminated; some code (e.g. new_identifier)
1185 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001186
1187 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001188 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001189
1190*/
1191
Alexander Belopolsky40018472011-02-26 01:02:56 +00001192static PyUnicodeObject *
1193_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001195 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001196 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197
Thomas Wouters477c8d52006-05-27 19:21:47 +00001198 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001199 if (length == 0 && unicode_empty != NULL) {
1200 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001201 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001202 }
1203
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001204 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001205 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001206 return (PyUnicodeObject *)PyErr_NoMemory();
1207 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001208 if (length < 0) {
1209 PyErr_SetString(PyExc_SystemError,
1210 "Negative size passed to _PyUnicode_New");
1211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001212 }
1213
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001214 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1215 if (unicode == NULL)
1216 return NULL;
1217 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001218
1219 _PyUnicode_WSTR_LENGTH(unicode) = length;
1220 _PyUnicode_HASH(unicode) = -1;
1221 _PyUnicode_STATE(unicode).interned = 0;
1222 _PyUnicode_STATE(unicode).kind = 0;
1223 _PyUnicode_STATE(unicode).compact = 0;
1224 _PyUnicode_STATE(unicode).ready = 0;
1225 _PyUnicode_STATE(unicode).ascii = 0;
1226 _PyUnicode_DATA_ANY(unicode) = NULL;
1227 _PyUnicode_LENGTH(unicode) = 0;
1228 _PyUnicode_UTF8(unicode) = NULL;
1229 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1230
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001231 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1232 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001233 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001234 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001235 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001236 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001237
Jeremy Hyltond8082792003-09-16 19:41:39 +00001238 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001239 * the caller fails before initializing str -- unicode_resize()
1240 * reads str[0], and the Keep-Alive optimization can keep memory
1241 * allocated for str alive across a call to unicode_dealloc(unicode).
1242 * We don't want unicode_resize to read uninitialized memory in
1243 * that case.
1244 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001245 _PyUnicode_WSTR(unicode)[0] = 0;
1246 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001247
Victor Stinner7931d9a2011-11-04 00:22:48 +01001248 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001249 return unicode;
1250}
1251
Victor Stinnerf42dc442011-10-02 23:33:16 +02001252static const char*
1253unicode_kind_name(PyObject *unicode)
1254{
Victor Stinner42dfd712011-10-03 14:41:45 +02001255 /* don't check consistency: unicode_kind_name() is called from
1256 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001257 if (!PyUnicode_IS_COMPACT(unicode))
1258 {
1259 if (!PyUnicode_IS_READY(unicode))
1260 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001261 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001262 {
1263 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001264 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001265 return "legacy ascii";
1266 else
1267 return "legacy latin1";
1268 case PyUnicode_2BYTE_KIND:
1269 return "legacy UCS2";
1270 case PyUnicode_4BYTE_KIND:
1271 return "legacy UCS4";
1272 default:
1273 return "<legacy invalid kind>";
1274 }
1275 }
1276 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001277 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001278 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001279 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001280 return "ascii";
1281 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001282 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001283 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001284 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001285 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 default:
1288 return "<invalid compact kind>";
1289 }
1290}
1291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001292#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001293/* Functions wrapping macros for use in debugger */
/* Function wrapper around the PyUnicode_UTF8() macro, callable from a
   debugger.  `unicode_raw` is the address of a string object. */
char *
_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1298
/* Function wrapper around the _PyUnicode_COMPACT_DATA() macro, callable
   from a debugger.  `unicode_raw` is the address of a string object. */
void *
_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Victor Stinnera42de742018-11-22 10:25:22 +01001303void *_PyUnicode_data(void *unicode_raw) {
1304 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001305 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1307 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1308 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1309 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1310 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1311 return PyUnicode_DATA(unicode);
1312}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001313
1314void
1315_PyUnicode_Dump(PyObject *op)
1316{
1317 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001318 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1319 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1320 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001321
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001323 {
1324 if (ascii->state.ascii)
1325 data = (ascii + 1);
1326 else
1327 data = (compact + 1);
1328 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001329 else
1330 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001331 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1332 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001333
Victor Stinnera849a4b2011-10-03 12:12:11 +02001334 if (ascii->wstr == data)
1335 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001336 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001337
Victor Stinnera3b334d2011-10-03 13:53:37 +02001338 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001339 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001340 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1341 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001342 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001343 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001344 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001346}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347#endif
1348
1349PyObject *
1350PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1351{
1352 PyObject *obj;
1353 PyCompactUnicodeObject *unicode;
1354 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001355 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001356 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001357 Py_ssize_t char_size;
1358 Py_ssize_t struct_size;
1359
1360 /* Optimization for empty strings */
1361 if (size == 0 && unicode_empty != NULL) {
1362 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001363 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001364 }
1365
Victor Stinner9e9d6892011-10-04 01:02:02 +02001366 is_ascii = 0;
1367 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 struct_size = sizeof(PyCompactUnicodeObject);
1369 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001370 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001371 char_size = 1;
1372 is_ascii = 1;
1373 struct_size = sizeof(PyASCIIObject);
1374 }
1375 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001376 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001377 char_size = 1;
1378 }
1379 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 2;
1382 if (sizeof(wchar_t) == 2)
1383 is_sharing = 1;
1384 }
1385 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001386 if (maxchar > MAX_UNICODE) {
1387 PyErr_SetString(PyExc_SystemError,
1388 "invalid maximum character passed to PyUnicode_New");
1389 return NULL;
1390 }
Victor Stinner8f825062012-04-27 13:55:39 +02001391 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 char_size = 4;
1393 if (sizeof(wchar_t) == 4)
1394 is_sharing = 1;
1395 }
1396
1397 /* Ensure we won't overflow the size. */
1398 if (size < 0) {
1399 PyErr_SetString(PyExc_SystemError,
1400 "Negative size passed to PyUnicode_New");
1401 return NULL;
1402 }
1403 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1404 return PyErr_NoMemory();
1405
1406 /* Duplicated allocation code from _PyObject_New() instead of a call to
1407 * PyObject_New() so we are able to allocate space for the object and
1408 * it's data buffer.
1409 */
1410 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1411 if (obj == NULL)
1412 return PyErr_NoMemory();
1413 obj = PyObject_INIT(obj, &PyUnicode_Type);
1414 if (obj == NULL)
1415 return NULL;
1416
1417 unicode = (PyCompactUnicodeObject *)obj;
1418 if (is_ascii)
1419 data = ((PyASCIIObject*)obj) + 1;
1420 else
1421 data = unicode + 1;
1422 _PyUnicode_LENGTH(unicode) = size;
1423 _PyUnicode_HASH(unicode) = -1;
1424 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001425 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426 _PyUnicode_STATE(unicode).compact = 1;
1427 _PyUnicode_STATE(unicode).ready = 1;
1428 _PyUnicode_STATE(unicode).ascii = is_ascii;
1429 if (is_ascii) {
1430 ((char*)data)[size] = 0;
1431 _PyUnicode_WSTR(unicode) = NULL;
1432 }
Victor Stinner8f825062012-04-27 13:55:39 +02001433 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001438 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001439 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001440 else {
1441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001445 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001446 ((Py_UCS4*)data)[size] = 0;
1447 if (is_sharing) {
1448 _PyUnicode_WSTR_LENGTH(unicode) = size;
1449 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1450 }
1451 else {
1452 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1453 _PyUnicode_WSTR(unicode) = NULL;
1454 }
1455 }
Victor Stinner8f825062012-04-27 13:55:39 +02001456#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001457 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001458#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001459 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001460 return obj;
1461}
1462
#if SIZEOF_WCHAR_T == 2
/* Expand the 16-bit wchar_t range [begin, end) into the 4-byte buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into one code point; every other unit, lone surrogates included,
   is copied as is.  The remaining width conversions are implemented as
   macros for efficiency.

   `unicode` must already be a 4-byte-kind string whose length equals the
   number of code points produced; the asserts below check this.  The
   caller is expected to have reserved one extra code point for a
   terminating null character, which this function does not write. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units become one output character. */
            *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
            continue;
        }
        *dst++ = *src++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1502
Victor Stinnercd9950f2011-10-02 00:34:53 +02001503static int
Victor Stinner488fa492011-12-12 00:01:39 +01001504unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001505{
Victor Stinner488fa492011-12-12 00:01:39 +01001506 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001507 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001508 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509 return -1;
1510 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001511 return 0;
1512}
1513
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001514static int
1515_copy_characters(PyObject *to, Py_ssize_t to_start,
1516 PyObject *from, Py_ssize_t from_start,
1517 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001518{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001519 unsigned int from_kind, to_kind;
1520 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001521
Victor Stinneree4544c2012-05-09 22:24:08 +02001522 assert(0 <= how_many);
1523 assert(0 <= from_start);
1524 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001525 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001526 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001527 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001528
Victor Stinnerd3f08822012-05-29 12:57:52 +02001529 assert(PyUnicode_Check(to));
1530 assert(PyUnicode_IS_READY(to));
1531 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1532
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001533 if (how_many == 0)
1534 return 0;
1535
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001536 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001537 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001538 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001539 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540
Victor Stinnerf1852262012-06-16 16:38:26 +02001541#ifdef Py_DEBUG
1542 if (!check_maxchar
1543 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1544 {
1545 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1546 Py_UCS4 ch;
1547 Py_ssize_t i;
1548 for (i=0; i < how_many; i++) {
1549 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1550 assert(ch <= to_maxchar);
1551 }
1552 }
1553#endif
1554
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001555 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001556 if (check_maxchar
1557 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1558 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 /* Writing Latin-1 characters into an ASCII string requires to
1560 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001561 Py_UCS4 max_char;
1562 max_char = ucs1lib_find_max_char(from_data,
1563 (Py_UCS1*)from_data + how_many);
1564 if (max_char >= 128)
1565 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001566 }
Christian Heimesf051e432016-09-13 20:22:02 +02001567 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001568 (char*)from_data + from_kind * from_start,
1569 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001570 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001571 else if (from_kind == PyUnicode_1BYTE_KIND
1572 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001573 {
1574 _PyUnicode_CONVERT_BYTES(
1575 Py_UCS1, Py_UCS2,
1576 PyUnicode_1BYTE_DATA(from) + from_start,
1577 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1578 PyUnicode_2BYTE_DATA(to) + to_start
1579 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001580 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001581 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001582 && to_kind == PyUnicode_4BYTE_KIND)
1583 {
1584 _PyUnicode_CONVERT_BYTES(
1585 Py_UCS1, Py_UCS4,
1586 PyUnicode_1BYTE_DATA(from) + from_start,
1587 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1588 PyUnicode_4BYTE_DATA(to) + to_start
1589 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001590 }
1591 else if (from_kind == PyUnicode_2BYTE_KIND
1592 && to_kind == PyUnicode_4BYTE_KIND)
1593 {
1594 _PyUnicode_CONVERT_BYTES(
1595 Py_UCS2, Py_UCS4,
1596 PyUnicode_2BYTE_DATA(from) + from_start,
1597 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1598 PyUnicode_4BYTE_DATA(to) + to_start
1599 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001600 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001601 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001602 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1603
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001604 if (!check_maxchar) {
1605 if (from_kind == PyUnicode_2BYTE_KIND
1606 && to_kind == PyUnicode_1BYTE_KIND)
1607 {
1608 _PyUnicode_CONVERT_BYTES(
1609 Py_UCS2, Py_UCS1,
1610 PyUnicode_2BYTE_DATA(from) + from_start,
1611 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1612 PyUnicode_1BYTE_DATA(to) + to_start
1613 );
1614 }
1615 else if (from_kind == PyUnicode_4BYTE_KIND
1616 && to_kind == PyUnicode_1BYTE_KIND)
1617 {
1618 _PyUnicode_CONVERT_BYTES(
1619 Py_UCS4, Py_UCS1,
1620 PyUnicode_4BYTE_DATA(from) + from_start,
1621 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1622 PyUnicode_1BYTE_DATA(to) + to_start
1623 );
1624 }
1625 else if (from_kind == PyUnicode_4BYTE_KIND
1626 && to_kind == PyUnicode_2BYTE_KIND)
1627 {
1628 _PyUnicode_CONVERT_BYTES(
1629 Py_UCS4, Py_UCS2,
1630 PyUnicode_4BYTE_DATA(from) + from_start,
1631 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1632 PyUnicode_2BYTE_DATA(to) + to_start
1633 );
1634 }
1635 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001636 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001637 }
1638 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001639 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001640 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001641 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001642 Py_ssize_t i;
1643
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 for (i=0; i < how_many; i++) {
1645 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001646 if (ch > to_maxchar)
1647 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1649 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001650 }
1651 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652 return 0;
1653}
1654
Victor Stinnerd3f08822012-05-29 12:57:52 +02001655void
1656_PyUnicode_FastCopyCharacters(
1657 PyObject *to, Py_ssize_t to_start,
1658 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001659{
1660 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1661}
1662
1663Py_ssize_t
1664PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1665 PyObject *from, Py_ssize_t from_start,
1666 Py_ssize_t how_many)
1667{
1668 int err;
1669
1670 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1671 PyErr_BadInternalCall();
1672 return -1;
1673 }
1674
Benjamin Petersonbac79492012-01-14 13:34:47 -05001675 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001676 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001677 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001678 return -1;
1679
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001680 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001681 PyErr_SetString(PyExc_IndexError, "string index out of range");
1682 return -1;
1683 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if (how_many < 0) {
1689 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1690 return -1;
1691 }
1692 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001693 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1694 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001695 "Cannot write %zi characters at %zi "
1696 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 how_many, to_start, PyUnicode_GET_LENGTH(to));
1698 return -1;
1699 }
1700
1701 if (how_many == 0)
1702 return 0;
1703
Victor Stinner488fa492011-12-12 00:01:39 +01001704 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001705 return -1;
1706
1707 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1708 if (err) {
1709 PyErr_Format(PyExc_SystemError,
1710 "Cannot copy %s characters "
1711 "into a string of %s characters",
1712 unicode_kind_name(from),
1713 unicode_kind_name(to));
1714 return -1;
1715 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001716 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001717}
1718
Victor Stinner17222162011-09-28 22:15:37 +02001719/* Find the maximum code point and count the number of surrogate pairs so a
1720 correct string length can be computed before converting a string to UCS4.
1721 This function counts single surrogates as a character and not as a pair.
1722
1723 Return 0 on success, or -1 on error. */
1724static int
1725find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1726 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001727{
1728 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001729 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001730
Victor Stinnerc53be962011-10-02 21:33:54 +02001731 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001732 *num_surrogates = 0;
1733 *maxchar = 0;
1734
1735 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001737 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1738 && (iter+1) < end
1739 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1740 {
1741 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1742 ++(*num_surrogates);
1743 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 }
1745 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001746#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001747 {
1748 ch = *iter;
1749 iter++;
1750 }
1751 if (ch > *maxchar) {
1752 *maxchar = ch;
1753 if (*maxchar > MAX_UNICODE) {
1754 PyErr_Format(PyExc_ValueError,
1755 "character U+%x is not in range [U+0000; U+10ffff]",
1756 ch);
1757 return -1;
1758 }
1759 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001760 }
1761 return 0;
1762}
1763
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001764int
1765_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001766{
1767 wchar_t *end;
1768 Py_UCS4 maxchar = 0;
1769 Py_ssize_t num_surrogates;
1770#if SIZEOF_WCHAR_T == 2
1771 Py_ssize_t length_wo_surrogates;
1772#endif
1773
Georg Brandl7597add2011-10-05 16:36:47 +02001774 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001775 strings were created using _PyObject_New() and where no canonical
1776 representation (the str field) has been set yet aka strings
1777 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001778 assert(_PyUnicode_CHECK(unicode));
1779 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001780 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001781 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001782 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001783 /* Actually, it should neither be interned nor be anything else: */
1784 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001785
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001786 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001787 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001788 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790
1791 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1793 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794 PyErr_NoMemory();
1795 return -1;
1796 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001797 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 _PyUnicode_WSTR(unicode), end,
1799 PyUnicode_1BYTE_DATA(unicode));
1800 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1801 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1802 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1803 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001804 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001805 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001806 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001807 }
1808 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001809 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8(unicode) = NULL;
1811 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001812 }
1813 PyObject_FREE(_PyUnicode_WSTR(unicode));
1814 _PyUnicode_WSTR(unicode) = NULL;
1815 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1816 }
1817 /* In this case we might have to convert down from 4-byte native
1818 wchar_t to 2-byte unicode. */
1819 else if (maxchar < 65536) {
1820 assert(num_surrogates == 0 &&
1821 "FindMaxCharAndNumSurrogatePairs() messed up");
1822
Victor Stinner506f5922011-09-28 22:34:18 +02001823#if SIZEOF_WCHAR_T == 2
1824 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001825 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001826 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1827 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1828 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001829 _PyUnicode_UTF8(unicode) = NULL;
1830 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001831#else
1832 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001833 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001834 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001835 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001836 PyErr_NoMemory();
1837 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001838 }
Victor Stinner506f5922011-09-28 22:34:18 +02001839 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1840 _PyUnicode_WSTR(unicode), end,
1841 PyUnicode_2BYTE_DATA(unicode));
1842 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1843 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1844 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001845 _PyUnicode_UTF8(unicode) = NULL;
1846 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001847 PyObject_FREE(_PyUnicode_WSTR(unicode));
1848 _PyUnicode_WSTR(unicode) = NULL;
1849 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1850#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001851 }
1852 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1853 else {
1854#if SIZEOF_WCHAR_T == 2
1855 /* in case the native representation is 2-bytes, we need to allocate a
1856 new normalized 4-byte version. */
1857 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001858 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1859 PyErr_NoMemory();
1860 return -1;
1861 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001862 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1863 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001864 PyErr_NoMemory();
1865 return -1;
1866 }
1867 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1868 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001869 _PyUnicode_UTF8(unicode) = NULL;
1870 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001871 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1872 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001873 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001874 PyObject_FREE(_PyUnicode_WSTR(unicode));
1875 _PyUnicode_WSTR(unicode) = NULL;
1876 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1877#else
1878 assert(num_surrogates == 0);
1879
Victor Stinnerc3c74152011-10-02 20:39:55 +02001880 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001881 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001882 _PyUnicode_UTF8(unicode) = NULL;
1883 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001884 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1885#endif
1886 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1887 }
1888 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001889 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001890 return 0;
1891}
1892
Alexander Belopolsky40018472011-02-26 01:02:56 +00001893static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001894unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001895{
Walter Dörwald16807132007-05-25 13:52:07 +00001896 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001897 case SSTATE_NOT_INTERNED:
1898 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001899
Benjamin Peterson29060642009-01-31 22:14:21 +00001900 case SSTATE_INTERNED_MORTAL:
1901 /* revive dead object temporarily for DelItem */
1902 Py_REFCNT(unicode) = 3;
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001903 if (PyDict_DelItem(interned, unicode) != 0) {
1904 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1905 NULL);
1906 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001907 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001908
Benjamin Peterson29060642009-01-31 22:14:21 +00001909 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001910 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001915 }
1916
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001917 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001918 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001919 }
1920 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001921 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001922 }
1923 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001924 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001925 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001926
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001927 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001928}
1929
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the empty-string singleton or
   the object stored in unicode_latin1[] for its single character,
   0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    const PyASCIIObject *header = (const PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty) {
        return 1;
    }
    /* Only a one-character string with a canonical representation can be
       a latin1 singleton. */
    if (header->state.kind == PyUnicode_WCHAR_KIND || header->length != 1) {
        return 0;
    }
    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1946
Alexander Belopolsky40018472011-02-26 01:02:56 +00001947static int
Victor Stinner488fa492011-12-12 00:01:39 +01001948unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001949{
Victor Stinner488fa492011-12-12 00:01:39 +01001950 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001951 if (Py_REFCNT(unicode) != 1)
1952 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001953 if (_PyUnicode_HASH(unicode) != -1)
1954 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955 if (PyUnicode_CHECK_INTERNED(unicode))
1956 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001957 if (!PyUnicode_CheckExact(unicode))
1958 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001959#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001960 /* singleton refcount is greater than 1 */
1961 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001962#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001963 return 1;
1964}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001965
Victor Stinnerfe226c02011-10-03 03:52:20 +02001966static int
1967unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1968{
1969 PyObject *unicode;
1970 Py_ssize_t old_length;
1971
1972 assert(p_unicode != NULL);
1973 unicode = *p_unicode;
1974
1975 assert(unicode != NULL);
1976 assert(PyUnicode_Check(unicode));
1977 assert(0 <= length);
1978
Victor Stinner910337b2011-10-03 03:20:16 +02001979 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001980 old_length = PyUnicode_WSTR_LENGTH(unicode);
1981 else
1982 old_length = PyUnicode_GET_LENGTH(unicode);
1983 if (old_length == length)
1984 return 0;
1985
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001986 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001987 _Py_INCREF_UNICODE_EMPTY();
1988 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001989 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001990 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001991 return 0;
1992 }
1993
Victor Stinner488fa492011-12-12 00:01:39 +01001994 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001995 PyObject *copy = resize_copy(unicode, length);
1996 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001997 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001998 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001999 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002000 }
2001
Victor Stinnerfe226c02011-10-03 03:52:20 +02002002 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002003 PyObject *new_unicode = resize_compact(unicode, length);
2004 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002005 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002006 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002007 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002008 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002009 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002010}
2011
Alexander Belopolsky40018472011-02-26 01:02:56 +00002012int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002014{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002015 PyObject *unicode;
2016 if (p_unicode == NULL) {
2017 PyErr_BadInternalCall();
2018 return -1;
2019 }
2020 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002021 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002022 {
2023 PyErr_BadInternalCall();
2024 return -1;
2025 }
2026 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002027}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002028
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002029/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002030
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002031 WARNING: The function doesn't copy the terminating null character and
2032 doesn't check the maximum character (may write a latin1 character in an
2033 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002034static void
2035unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2036 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002037{
2038 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2039 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002040 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002041
2042 switch (kind) {
2043 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002044 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002045#ifdef Py_DEBUG
2046 if (PyUnicode_IS_ASCII(unicode)) {
2047 Py_UCS4 maxchar = ucs1lib_find_max_char(
2048 (const Py_UCS1*)str,
2049 (const Py_UCS1*)str + len);
2050 assert(maxchar < 128);
2051 }
2052#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002053 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002054 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002055 }
2056 case PyUnicode_2BYTE_KIND: {
2057 Py_UCS2 *start = (Py_UCS2 *)data + index;
2058 Py_UCS2 *ucs2 = start;
2059 assert(index <= PyUnicode_GET_LENGTH(unicode));
2060
Victor Stinner184252a2012-06-16 02:57:41 +02002061 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002062 *ucs2 = (Py_UCS2)*str;
2063
2064 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002065 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 }
2067 default: {
2068 Py_UCS4 *start = (Py_UCS4 *)data + index;
2069 Py_UCS4 *ucs4 = start;
2070 assert(kind == PyUnicode_4BYTE_KIND);
2071 assert(index <= PyUnicode_GET_LENGTH(unicode));
2072
Victor Stinner184252a2012-06-16 02:57:41 +02002073 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002074 *ucs4 = (Py_UCS4)*str;
2075
2076 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002077 }
2078 }
2079}
2080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002081static PyObject*
2082get_latin1_char(unsigned char ch)
2083{
Victor Stinnera464fc12011-10-02 20:39:30 +02002084 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002086 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002087 if (!unicode)
2088 return NULL;
2089 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002090 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 unicode_latin1[ch] = unicode;
2092 }
2093 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002094 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095}
2096
Victor Stinner985a82a2014-01-03 12:53:47 +01002097static PyObject*
2098unicode_char(Py_UCS4 ch)
2099{
2100 PyObject *unicode;
2101
2102 assert(ch <= MAX_UNICODE);
2103
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002104 if (ch < 256)
2105 return get_latin1_char(ch);
2106
Victor Stinner985a82a2014-01-03 12:53:47 +01002107 unicode = PyUnicode_New(1, ch);
2108 if (unicode == NULL)
2109 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002110
2111 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2112 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002113 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002114 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002115 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2116 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2117 }
2118 assert(_PyUnicode_CheckConsistency(unicode, 1));
2119 return unicode;
2120}
2121
Alexander Belopolsky40018472011-02-26 01:02:56 +00002122PyObject *
2123PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002124{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002125 if (u == NULL)
2126 return (PyObject*)_PyUnicode_New(size);
2127
2128 if (size < 0) {
2129 PyErr_BadInternalCall();
2130 return NULL;
2131 }
2132
2133 return PyUnicode_FromWideChar(u, size);
2134}
2135
2136PyObject *
2137PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2138{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002139 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140 Py_UCS4 maxchar = 0;
2141 Py_ssize_t num_surrogates;
2142
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002143 if (u == NULL && size != 0) {
2144 PyErr_BadInternalCall();
2145 return NULL;
2146 }
2147
2148 if (size == -1) {
2149 size = wcslen(u);
2150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002152 /* If the Unicode data is known at construction time, we can apply
2153 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002154
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002155 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002156 if (size == 0)
2157 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 /* Single character Unicode objects in the Latin-1 range are
2160 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002161 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002162 return get_latin1_char((unsigned char)*u);
2163
2164 /* If not empty and not single character, copy the Unicode data
2165 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002166 if (find_maxchar_surrogates(u, u + size,
2167 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002168 return NULL;
2169
Victor Stinner8faf8212011-12-08 22:14:11 +01002170 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002171 if (!unicode)
2172 return NULL;
2173
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002174 switch (PyUnicode_KIND(unicode)) {
2175 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002176 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002177 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2178 break;
2179 case PyUnicode_2BYTE_KIND:
2180#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002181 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002182#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002183 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002184 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2185#endif
2186 break;
2187 case PyUnicode_4BYTE_KIND:
2188#if SIZEOF_WCHAR_T == 2
2189 /* This is the only case which has to process surrogates, thus
2190 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002191 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002192#else
2193 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002194 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195#endif
2196 break;
2197 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002198 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002200
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002201 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002202}
2203
Alexander Belopolsky40018472011-02-26 01:02:56 +00002204PyObject *
2205PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002206{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002207 if (size < 0) {
2208 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002209 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002210 return NULL;
2211 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002212 if (u != NULL)
2213 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2214 else
2215 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002216}
2217
Alexander Belopolsky40018472011-02-26 01:02:56 +00002218PyObject *
2219PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002220{
2221 size_t size = strlen(u);
2222 if (size > PY_SSIZE_T_MAX) {
2223 PyErr_SetString(PyExc_OverflowError, "input too long");
2224 return NULL;
2225 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002226 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002227}
2228
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002229PyObject *
2230_PyUnicode_FromId(_Py_Identifier *id)
2231{
2232 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002233 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2234 strlen(id->string),
2235 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002236 if (!id->object)
2237 return NULL;
2238 PyUnicode_InternInPlace(&id->object);
2239 assert(!id->next);
2240 id->next = static_strings;
2241 static_strings = id;
2242 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002243 return id->object;
2244}
2245
2246void
2247_PyUnicode_ClearStaticStrings()
2248{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002249 _Py_Identifier *tmp, *s = static_strings;
2250 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002251 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002252 tmp = s->next;
2253 s->next = NULL;
2254 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002255 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002256 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002257}
2258
Benjamin Peterson0df54292012-03-26 14:50:32 -04002259/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002260
Victor Stinnerd3f08822012-05-29 12:57:52 +02002261PyObject*
2262_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002263{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002264 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002265 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002266 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002267#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002268 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002269#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002270 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002271 }
Victor Stinner785938e2011-12-11 20:09:03 +01002272 unicode = PyUnicode_New(size, 127);
2273 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002274 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002275 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2276 assert(_PyUnicode_CheckConsistency(unicode, 1));
2277 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002278}
2279
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002280static Py_UCS4
2281kind_maxchar_limit(unsigned int kind)
2282{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002283 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002284 case PyUnicode_1BYTE_KIND:
2285 return 0x80;
2286 case PyUnicode_2BYTE_KIND:
2287 return 0x100;
2288 case PyUnicode_4BYTE_KIND:
2289 return 0x10000;
2290 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002291 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002292 }
2293}
2294
Victor Stinner702c7342011-10-05 13:50:52 +02002295static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002296_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002298 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002299 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002300
Serhiy Storchaka678db842013-01-26 12:16:36 +02002301 if (size == 0)
2302 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002303 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002304 if (size == 1)
2305 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002306
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002307 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002308 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 if (!res)
2310 return NULL;
2311 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002312 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002314}
2315
Victor Stinnere57b1c02011-09-28 22:20:48 +02002316static PyObject*
2317_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002318{
2319 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002320 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002321
Serhiy Storchaka678db842013-01-26 12:16:36 +02002322 if (size == 0)
2323 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002324 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002325 if (size == 1)
2326 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002327
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002328 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002329 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002330 if (!res)
2331 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002332 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002334 else {
2335 _PyUnicode_CONVERT_BYTES(
2336 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2337 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002338 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339 return res;
2340}
2341
Victor Stinnere57b1c02011-09-28 22:20:48 +02002342static PyObject*
2343_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344{
2345 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002346 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002347
Serhiy Storchaka678db842013-01-26 12:16:36 +02002348 if (size == 0)
2349 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002351 if (size == 1)
2352 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002353
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002354 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002355 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002356 if (!res)
2357 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002358 if (max_char < 256)
2359 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2360 PyUnicode_1BYTE_DATA(res));
2361 else if (max_char < 0x10000)
2362 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2363 PyUnicode_2BYTE_DATA(res));
2364 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002365 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002366 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 return res;
2368}
2369
2370PyObject*
2371PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2372{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002373 if (size < 0) {
2374 PyErr_SetString(PyExc_ValueError, "size must be positive");
2375 return NULL;
2376 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002377 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002379 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002380 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002381 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002383 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002384 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002385 PyErr_SetString(PyExc_SystemError, "invalid kind");
2386 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002388}
2389
Victor Stinnerece58de2012-04-23 23:36:38 +02002390Py_UCS4
2391_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2392{
2393 enum PyUnicode_Kind kind;
2394 void *startptr, *endptr;
2395
2396 assert(PyUnicode_IS_READY(unicode));
2397 assert(0 <= start);
2398 assert(end <= PyUnicode_GET_LENGTH(unicode));
2399 assert(start <= end);
2400
2401 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2402 return PyUnicode_MAX_CHAR_VALUE(unicode);
2403
2404 if (start == end)
2405 return 127;
2406
Victor Stinner94d558b2012-04-27 22:26:58 +02002407 if (PyUnicode_IS_ASCII(unicode))
2408 return 127;
2409
Victor Stinnerece58de2012-04-23 23:36:38 +02002410 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002411 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002412 endptr = (char *)startptr + end * kind;
2413 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002414 switch(kind) {
2415 case PyUnicode_1BYTE_KIND:
2416 return ucs1lib_find_max_char(startptr, endptr);
2417 case PyUnicode_2BYTE_KIND:
2418 return ucs2lib_find_max_char(startptr, endptr);
2419 case PyUnicode_4BYTE_KIND:
2420 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002421 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002422 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002423 }
2424}
2425
Victor Stinner25a4b292011-10-06 12:31:55 +02002426/* Ensure that a string uses the most efficient storage, if it is not the
2427 case: create a new string with of the right kind. Write NULL into *p_unicode
2428 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002429static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002430unicode_adjust_maxchar(PyObject **p_unicode)
2431{
2432 PyObject *unicode, *copy;
2433 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002434 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002435 unsigned int kind;
2436
2437 assert(p_unicode != NULL);
2438 unicode = *p_unicode;
2439 assert(PyUnicode_IS_READY(unicode));
2440 if (PyUnicode_IS_ASCII(unicode))
2441 return;
2442
2443 len = PyUnicode_GET_LENGTH(unicode);
2444 kind = PyUnicode_KIND(unicode);
2445 if (kind == PyUnicode_1BYTE_KIND) {
2446 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002447 max_char = ucs1lib_find_max_char(u, u + len);
2448 if (max_char >= 128)
2449 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002450 }
2451 else if (kind == PyUnicode_2BYTE_KIND) {
2452 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002453 max_char = ucs2lib_find_max_char(u, u + len);
2454 if (max_char >= 256)
2455 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002456 }
2457 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002458 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002459 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002460 max_char = ucs4lib_find_max_char(u, u + len);
2461 if (max_char >= 0x10000)
2462 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002464 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002465 if (copy != NULL)
2466 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002467 Py_DECREF(unicode);
2468 *p_unicode = copy;
2469}
2470
Victor Stinner034f6cf2011-09-30 02:26:44 +02002471PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002472_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002473{
Victor Stinner87af4f22011-11-21 23:03:47 +01002474 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002475 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002476
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477 if (!PyUnicode_Check(unicode)) {
2478 PyErr_BadInternalCall();
2479 return NULL;
2480 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002481 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002482 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002483
Victor Stinner87af4f22011-11-21 23:03:47 +01002484 length = PyUnicode_GET_LENGTH(unicode);
2485 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002486 if (!copy)
2487 return NULL;
2488 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2489
Christian Heimesf051e432016-09-13 20:22:02 +02002490 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002491 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002492 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002493 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002494}
2495
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002496
Victor Stinnerbc603d12011-10-02 01:00:40 +02002497/* Widen Unicode objects to larger buffers. Don't write terminating null
2498 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002499
2500void*
2501_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2502{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002503 Py_ssize_t len;
2504 void *result;
2505 unsigned int skind;
2506
Benjamin Petersonbac79492012-01-14 13:34:47 -05002507 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002508 return NULL;
2509
2510 len = PyUnicode_GET_LENGTH(s);
2511 skind = PyUnicode_KIND(s);
2512 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002513 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002514 return NULL;
2515 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002516 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002518 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002519 if (!result)
2520 return PyErr_NoMemory();
2521 assert(skind == PyUnicode_1BYTE_KIND);
2522 _PyUnicode_CONVERT_BYTES(
2523 Py_UCS1, Py_UCS2,
2524 PyUnicode_1BYTE_DATA(s),
2525 PyUnicode_1BYTE_DATA(s) + len,
2526 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002527 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002528 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002529 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002530 if (!result)
2531 return PyErr_NoMemory();
2532 if (skind == PyUnicode_2BYTE_KIND) {
2533 _PyUnicode_CONVERT_BYTES(
2534 Py_UCS2, Py_UCS4,
2535 PyUnicode_2BYTE_DATA(s),
2536 PyUnicode_2BYTE_DATA(s) + len,
2537 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002538 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002539 else {
2540 assert(skind == PyUnicode_1BYTE_KIND);
2541 _PyUnicode_CONVERT_BYTES(
2542 Py_UCS1, Py_UCS4,
2543 PyUnicode_1BYTE_DATA(s),
2544 PyUnicode_1BYTE_DATA(s) + len,
2545 result);
2546 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002547 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002548 default:
2549 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550 }
Victor Stinner01698042011-10-04 00:04:26 +02002551 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002552 return NULL;
2553}
2554
2555static Py_UCS4*
2556as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2557 int copy_null)
2558{
2559 int kind;
2560 void *data;
2561 Py_ssize_t len, targetlen;
2562 if (PyUnicode_READY(string) == -1)
2563 return NULL;
2564 kind = PyUnicode_KIND(string);
2565 data = PyUnicode_DATA(string);
2566 len = PyUnicode_GET_LENGTH(string);
2567 targetlen = len;
2568 if (copy_null)
2569 targetlen++;
2570 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002571 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002572 if (!target) {
2573 PyErr_NoMemory();
2574 return NULL;
2575 }
2576 }
2577 else {
2578 if (targetsize < targetlen) {
2579 PyErr_Format(PyExc_SystemError,
2580 "string is longer than the buffer");
2581 if (copy_null && 0 < targetsize)
2582 target[0] = 0;
2583 return NULL;
2584 }
2585 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002586 if (kind == PyUnicode_1BYTE_KIND) {
2587 Py_UCS1 *start = (Py_UCS1 *) data;
2588 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002589 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002590 else if (kind == PyUnicode_2BYTE_KIND) {
2591 Py_UCS2 *start = (Py_UCS2 *) data;
2592 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2593 }
2594 else {
2595 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002596 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002597 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002598 if (copy_null)
2599 target[len] = 0;
2600 return target;
2601}
2602
2603Py_UCS4*
2604PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2605 int copy_null)
2606{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002607 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002608 PyErr_BadInternalCall();
2609 return NULL;
2610 }
2611 return as_ucs4(string, target, targetsize, copy_null);
2612}
2613
2614Py_UCS4*
2615PyUnicode_AsUCS4Copy(PyObject *string)
2616{
2617 return as_ucs4(string, NULL, 0, 1);
2618}
2619
Victor Stinner15a11362012-10-06 23:48:20 +02002620/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002621 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2622 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2623#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002624
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002625static int
2626unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2627 Py_ssize_t width, Py_ssize_t precision)
2628{
2629 Py_ssize_t length, fill, arglen;
2630 Py_UCS4 maxchar;
2631
2632 if (PyUnicode_READY(str) == -1)
2633 return -1;
2634
2635 length = PyUnicode_GET_LENGTH(str);
2636 if ((precision == -1 || precision >= length)
2637 && width <= length)
2638 return _PyUnicodeWriter_WriteStr(writer, str);
2639
2640 if (precision != -1)
2641 length = Py_MIN(precision, length);
2642
2643 arglen = Py_MAX(length, width);
2644 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2645 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2646 else
2647 maxchar = writer->maxchar;
2648
2649 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2650 return -1;
2651
2652 if (width > length) {
2653 fill = width - length;
2654 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2655 return -1;
2656 writer->pos += fill;
2657 }
2658
2659 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2660 str, 0, length);
2661 writer->pos += length;
2662 return 0;
2663}
2664
2665static int
Victor Stinner998b8062018-09-12 00:23:25 +02002666unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 Py_ssize_t width, Py_ssize_t precision)
2668{
2669 /* UTF-8 */
2670 Py_ssize_t length;
2671 PyObject *unicode;
2672 int res;
2673
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002674 if (precision == -1) {
2675 length = strlen(str);
2676 }
2677 else {
2678 length = 0;
2679 while (length < precision && str[length]) {
2680 length++;
2681 }
2682 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002683 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2684 if (unicode == NULL)
2685 return -1;
2686
2687 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2688 Py_DECREF(unicode);
2689 return res;
2690}
2691
Victor Stinner96865452011-03-01 23:44:09 +00002692static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002693unicode_fromformat_arg(_PyUnicodeWriter *writer,
2694 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002695{
Victor Stinnere215d962012-10-06 23:03:36 +02002696 const char *p;
2697 Py_ssize_t len;
2698 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002699 Py_ssize_t width;
2700 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002701 int longflag;
2702 int longlongflag;
2703 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002704 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002705
2706 p = f;
2707 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002708 zeropad = 0;
2709 if (*f == '0') {
2710 zeropad = 1;
2711 f++;
2712 }
Victor Stinner96865452011-03-01 23:44:09 +00002713
2714 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002715 width = -1;
2716 if (Py_ISDIGIT((unsigned)*f)) {
2717 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002718 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002719 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002720 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002721 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002723 return NULL;
2724 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002725 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002726 f++;
2727 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002728 }
2729 precision = -1;
2730 if (*f == '.') {
2731 f++;
2732 if (Py_ISDIGIT((unsigned)*f)) {
2733 precision = (*f - '0');
2734 f++;
2735 while (Py_ISDIGIT((unsigned)*f)) {
2736 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2737 PyErr_SetString(PyExc_ValueError,
2738 "precision too big");
2739 return NULL;
2740 }
2741 precision = (precision * 10) + (*f - '0');
2742 f++;
2743 }
2744 }
Victor Stinner96865452011-03-01 23:44:09 +00002745 if (*f == '%') {
2746 /* "%.3%s" => f points to "3" */
2747 f--;
2748 }
2749 }
2750 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002751 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002752 f--;
2753 }
Victor Stinner96865452011-03-01 23:44:09 +00002754
2755 /* Handle %ld, %lu, %lld and %llu. */
2756 longflag = 0;
2757 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002758 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002759 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002760 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002761 longflag = 1;
2762 ++f;
2763 }
Victor Stinner96865452011-03-01 23:44:09 +00002764 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002765 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002766 longlongflag = 1;
2767 f += 2;
2768 }
Victor Stinner96865452011-03-01 23:44:09 +00002769 }
2770 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002771 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002772 size_tflag = 1;
2773 ++f;
2774 }
Victor Stinnere215d962012-10-06 23:03:36 +02002775
2776 if (f[1] == '\0')
2777 writer->overallocate = 0;
2778
2779 switch (*f) {
2780 case 'c':
2781 {
2782 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002783 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002784 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002785 "character argument not in range(0x110000)");
2786 return NULL;
2787 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002788 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002789 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002790 break;
2791 }
2792
2793 case 'i':
2794 case 'd':
2795 case 'u':
2796 case 'x':
2797 {
2798 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002799 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002800 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002801
2802 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002803 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002804 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002805 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002806 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002807 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002808 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002809 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002810 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002811 va_arg(*vargs, size_t));
2812 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002813 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002814 va_arg(*vargs, unsigned int));
2815 }
2816 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002817 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002818 }
2819 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002820 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002821 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002822 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002823 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002824 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002825 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002826 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002827 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002828 va_arg(*vargs, Py_ssize_t));
2829 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002830 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002831 va_arg(*vargs, int));
2832 }
2833 assert(len >= 0);
2834
Victor Stinnere215d962012-10-06 23:03:36 +02002835 if (precision < len)
2836 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002837
2838 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002839 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2840 return NULL;
2841
Victor Stinnere215d962012-10-06 23:03:36 +02002842 if (width > precision) {
2843 Py_UCS4 fillchar;
2844 fill = width - precision;
2845 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002846 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2847 return NULL;
2848 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002849 }
Victor Stinner15a11362012-10-06 23:48:20 +02002850 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002851 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002852 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2853 return NULL;
2854 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002855 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002856
Victor Stinner4a587072013-11-19 12:54:53 +01002857 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2858 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002859 break;
2860 }
2861
2862 case 'p':
2863 {
2864 char number[MAX_LONG_LONG_CHARS];
2865
2866 len = sprintf(number, "%p", va_arg(*vargs, void*));
2867 assert(len >= 0);
2868
2869 /* %p is ill-defined: ensure leading 0x. */
2870 if (number[1] == 'X')
2871 number[1] = 'x';
2872 else if (number[1] != 'x') {
2873 memmove(number + 2, number,
2874 strlen(number) + 1);
2875 number[0] = '0';
2876 number[1] = 'x';
2877 len += 2;
2878 }
2879
Victor Stinner4a587072013-11-19 12:54:53 +01002880 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002881 return NULL;
2882 break;
2883 }
2884
2885 case 's':
2886 {
2887 /* UTF-8 */
2888 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002889 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002890 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002891 break;
2892 }
2893
2894 case 'U':
2895 {
2896 PyObject *obj = va_arg(*vargs, PyObject *);
2897 assert(obj && _PyUnicode_CHECK(obj));
2898
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002899 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002900 return NULL;
2901 break;
2902 }
2903
2904 case 'V':
2905 {
2906 PyObject *obj = va_arg(*vargs, PyObject *);
2907 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002908 if (obj) {
2909 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002910 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002911 return NULL;
2912 }
2913 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002915 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 }
2918 break;
2919 }
2920
2921 case 'S':
2922 {
2923 PyObject *obj = va_arg(*vargs, PyObject *);
2924 PyObject *str;
2925 assert(obj);
2926 str = PyObject_Str(obj);
2927 if (!str)
2928 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002929 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002930 Py_DECREF(str);
2931 return NULL;
2932 }
2933 Py_DECREF(str);
2934 break;
2935 }
2936
2937 case 'R':
2938 {
2939 PyObject *obj = va_arg(*vargs, PyObject *);
2940 PyObject *repr;
2941 assert(obj);
2942 repr = PyObject_Repr(obj);
2943 if (!repr)
2944 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002945 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002946 Py_DECREF(repr);
2947 return NULL;
2948 }
2949 Py_DECREF(repr);
2950 break;
2951 }
2952
2953 case 'A':
2954 {
2955 PyObject *obj = va_arg(*vargs, PyObject *);
2956 PyObject *ascii;
2957 assert(obj);
2958 ascii = PyObject_ASCII(obj);
2959 if (!ascii)
2960 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002961 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002962 Py_DECREF(ascii);
2963 return NULL;
2964 }
2965 Py_DECREF(ascii);
2966 break;
2967 }
2968
2969 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002970 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002971 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002972 break;
2973
2974 default:
2975 /* if we stumble upon an unknown formatting code, copy the rest
2976 of the format string to the output string. (we cannot just
2977 skip the code, since there's no way to know what's in the
2978 argument list) */
2979 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002980 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002981 return NULL;
2982 f = p+len;
2983 return f;
2984 }
2985
2986 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002987 return f;
2988}
2989
Walter Dörwaldd2034312007-05-18 16:29:38 +00002990PyObject *
2991PyUnicode_FromFormatV(const char *format, va_list vargs)
2992{
Victor Stinnere215d962012-10-06 23:03:36 +02002993 va_list vargs2;
2994 const char *f;
2995 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002996
Victor Stinner8f674cc2013-04-17 23:02:17 +02002997 _PyUnicodeWriter_Init(&writer);
2998 writer.min_length = strlen(format) + 100;
2999 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003000
Benjamin Peterson0c212142016-09-20 20:39:33 -07003001 // Copy varags to be able to pass a reference to a subfunction.
3002 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003003
3004 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003005 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003006 f = unicode_fromformat_arg(&writer, f, &vargs2);
3007 if (f == NULL)
3008 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003009 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003010 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003011 const char *p;
3012 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003013
Victor Stinnere215d962012-10-06 23:03:36 +02003014 p = f;
3015 do
3016 {
3017 if ((unsigned char)*p > 127) {
3018 PyErr_Format(PyExc_ValueError,
3019 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3020 "string, got a non-ASCII byte: 0x%02x",
3021 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003022 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003023 }
3024 p++;
3025 }
3026 while (*p != '\0' && *p != '%');
3027 len = p - f;
3028
3029 if (*p == '\0')
3030 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003031
3032 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003033 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003034
3035 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003036 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003037 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003038 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003039 return _PyUnicodeWriter_Finish(&writer);
3040
3041 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003042 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003043 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003044 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003045}
3046
Walter Dörwaldd2034312007-05-18 16:29:38 +00003047PyObject *
3048PyUnicode_FromFormat(const char *format, ...)
3049{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003050 PyObject* ret;
3051 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003052
3053#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003054 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003055#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003056 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003057#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 ret = PyUnicode_FromFormatV(format, vargs);
3059 va_end(vargs);
3060 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003061}
3062
Serhiy Storchakac46db922018-10-23 22:58:24 +03003063static Py_ssize_t
3064unicode_get_widechar_size(PyObject *unicode)
3065{
3066 Py_ssize_t res;
3067
3068 assert(unicode != NULL);
3069 assert(_PyUnicode_CHECK(unicode));
3070
3071 if (_PyUnicode_WSTR(unicode) != NULL) {
3072 return PyUnicode_WSTR_LENGTH(unicode);
3073 }
3074 assert(PyUnicode_IS_READY(unicode));
3075
3076 res = _PyUnicode_LENGTH(unicode);
3077#if SIZEOF_WCHAR_T == 2
3078 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3079 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3080 const Py_UCS4 *end = s + res;
3081 for (; s < end; ++s) {
3082 if (*s > 0xFFFF) {
3083 ++res;
3084 }
3085 }
3086 }
3087#endif
3088 return res;
3089}
3090
3091static void
3092unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3093{
3094 const wchar_t *wstr;
3095
3096 assert(unicode != NULL);
3097 assert(_PyUnicode_CHECK(unicode));
3098
3099 wstr = _PyUnicode_WSTR(unicode);
3100 if (wstr != NULL) {
3101 memcpy(w, wstr, size * sizeof(wchar_t));
3102 return;
3103 }
3104 assert(PyUnicode_IS_READY(unicode));
3105
3106 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3107 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3108 for (; size--; ++s, ++w) {
3109 *w = *s;
3110 }
3111 }
3112 else {
3113#if SIZEOF_WCHAR_T == 4
3114 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3115 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3116 for (; size--; ++s, ++w) {
3117 *w = *s;
3118 }
3119#else
3120 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3121 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3122 for (; size--; ++s, ++w) {
3123 Py_UCS4 ch = *s;
3124 if (ch > 0xFFFF) {
3125 assert(ch <= MAX_UNICODE);
3126 /* encode surrogate pair in this case */
3127 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3128 if (!size--)
3129 break;
3130 *w = Py_UNICODE_LOW_SURROGATE(ch);
3131 }
3132 else {
3133 *w = ch;
3134 }
3135 }
3136#endif
3137 }
3138}
3139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003140#ifdef HAVE_WCHAR_H
3141
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003142/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003143
Victor Stinnerd88d9832011-09-06 02:00:05 +02003144 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003145 character) required to convert the unicode object. Ignore size argument.
3146
Victor Stinnerd88d9832011-09-06 02:00:05 +02003147 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003148 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003149 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003150Py_ssize_t
3151PyUnicode_AsWideChar(PyObject *unicode,
3152 wchar_t *w,
3153 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003154{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003155 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003156
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003157 if (unicode == NULL) {
3158 PyErr_BadInternalCall();
3159 return -1;
3160 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003161 if (!PyUnicode_Check(unicode)) {
3162 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003163 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003164 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003165
3166 res = unicode_get_widechar_size(unicode);
3167 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003168 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003169 }
3170
3171 if (size > res) {
3172 size = res + 1;
3173 }
3174 else {
3175 res = size;
3176 }
3177 unicode_copy_as_widechar(unicode, w, size);
3178 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003179}
3180
Victor Stinner137c34c2010-09-29 10:25:54 +00003181wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003182PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003183 Py_ssize_t *size)
3184{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003185 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003186 Py_ssize_t buflen;
3187
3188 if (unicode == NULL) {
3189 PyErr_BadInternalCall();
3190 return NULL;
3191 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003192 if (!PyUnicode_Check(unicode)) {
3193 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003194 return NULL;
3195 }
3196
Serhiy Storchakac46db922018-10-23 22:58:24 +03003197 buflen = unicode_get_widechar_size(unicode);
3198 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003199 if (buffer == NULL) {
3200 PyErr_NoMemory();
3201 return NULL;
3202 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003203 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3204 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003205 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003206 }
3207 else if (wcslen(buffer) != (size_t)buflen) {
3208 PyMem_FREE(buffer);
3209 PyErr_SetString(PyExc_ValueError,
3210 "embedded null character");
3211 return NULL;
3212 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003213 return buffer;
3214}
3215
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003216#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003217
Alexander Belopolsky40018472011-02-26 01:02:56 +00003218PyObject *
3219PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003220{
Victor Stinner8faf8212011-12-08 22:14:11 +01003221 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003222 PyErr_SetString(PyExc_ValueError,
3223 "chr() arg not in range(0x110000)");
3224 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003225 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003226
Victor Stinner985a82a2014-01-03 12:53:47 +01003227 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003228}
3229
Alexander Belopolsky40018472011-02-26 01:02:56 +00003230PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003231PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003232{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003233 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003234 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003235 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003236 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003237 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 Py_INCREF(obj);
3239 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003240 }
3241 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 /* For a Unicode subtype that's not a Unicode object,
3243 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003244 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003245 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003246 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003247 "Can't convert '%.100s' object to str implicitly",
3248 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003249 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003250}
3251
Alexander Belopolsky40018472011-02-26 01:02:56 +00003252PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003253PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003254 const char *encoding,
3255 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003256{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003257 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003258 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003259
Guido van Rossumd57fd912000-03-10 22:53:23 +00003260 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003261 PyErr_BadInternalCall();
3262 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003263 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003264
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003265 /* Decoding bytes objects is the most common case and should be fast */
3266 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003267 if (PyBytes_GET_SIZE(obj) == 0) {
3268 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3269 return NULL;
3270 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003271 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003272 }
3273 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003274 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3275 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003276 }
3277
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003278 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003279 PyErr_SetString(PyExc_TypeError,
3280 "decoding str is not supported");
3281 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003282 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003283
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003284 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3285 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3286 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003287 "decoding to str: need a bytes-like object, %.80s found",
3288 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003289 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003290 }
Tim Petersced69f82003-09-16 20:30:58 +00003291
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003292 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003293 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003294 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3295 return NULL;
3296 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003297 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003298 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003299
Serhiy Storchaka05997252013-01-26 12:14:02 +02003300 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003301 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003302 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003303}
3304
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters is replaced by a single '_', except that a leading run
   is dropped and a trailing run produces nothing.

   `lower` is the output buffer and `lower_len` its total size in bytes,
   including room for the terminating null byte.

   Return 1 on success, or 0 on error (encoding is longer than lower_len-1,
   or lower_len is 0 so that not even the terminator fits). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* Without this guard, lower_len - 1 would wrap around and the final
       terminator would be written into a zero-sized buffer. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the terminator */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is only emitted if more
               name characters follow. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3353
Alexander Belopolsky40018472011-02-26 01:02:56 +00003354PyObject *
3355PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003356 Py_ssize_t size,
3357 const char *encoding,
3358 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003359{
3360 PyObject *buffer = NULL, *unicode;
3361 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003362 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3363
Victor Stinner22eb6892019-06-26 00:51:05 +02003364 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3365 return NULL;
3366 }
3367
Victor Stinnered076ed2019-06-26 01:49:32 +02003368 if (size == 0) {
3369 _Py_RETURN_UNICODE_EMPTY();
3370 }
3371
Victor Stinner942889a2016-09-05 15:40:10 -07003372 if (encoding == NULL) {
3373 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3374 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003375
Fred Drakee4315f52000-05-09 19:53:39 +00003376 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003377 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3378 char *lower = buflower;
3379
3380 /* Fast paths */
3381 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3382 lower += 3;
3383 if (*lower == '_') {
3384 /* Match "utf8" and "utf_8" */
3385 lower++;
3386 }
3387
3388 if (lower[0] == '8' && lower[1] == 0) {
3389 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3390 }
3391 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3392 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3393 }
3394 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3395 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3396 }
3397 }
3398 else {
3399 if (strcmp(lower, "ascii") == 0
3400 || strcmp(lower, "us_ascii") == 0) {
3401 return PyUnicode_DecodeASCII(s, size, errors);
3402 }
Steve Dowercc16be82016-09-08 10:35:16 -07003403 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003404 else if (strcmp(lower, "mbcs") == 0) {
3405 return PyUnicode_DecodeMBCS(s, size, errors);
3406 }
3407 #endif
3408 else if (strcmp(lower, "latin1") == 0
3409 || strcmp(lower, "latin_1") == 0
3410 || strcmp(lower, "iso_8859_1") == 0
3411 || strcmp(lower, "iso8859_1") == 0) {
3412 return PyUnicode_DecodeLatin1(s, size, errors);
3413 }
3414 }
Victor Stinner37296e82010-06-10 13:36:23 +00003415 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003416
3417 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003418 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003419 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003420 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003421 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003422 if (buffer == NULL)
3423 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003424 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003425 if (unicode == NULL)
3426 goto onError;
3427 if (!PyUnicode_Check(unicode)) {
3428 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003429 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003431 encoding,
3432 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003433 Py_DECREF(unicode);
3434 goto onError;
3435 }
3436 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003437 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003438
Benjamin Peterson29060642009-01-31 22:14:21 +00003439 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003440 Py_XDECREF(buffer);
3441 return NULL;
3442}
3443
Alexander Belopolsky40018472011-02-26 01:02:56 +00003444PyObject *
3445PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003446 const char *encoding,
3447 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003448{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003449 if (!PyUnicode_Check(unicode)) {
3450 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003451 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003452 }
3453
Serhiy Storchaka00939072016-10-27 21:05:49 +03003454 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3455 "PyUnicode_AsDecodedObject() is deprecated; "
3456 "use PyCodec_Decode() to decode from str", 1) < 0)
3457 return NULL;
3458
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003459 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003460 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003461
3462 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003463 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003464}
3465
Alexander Belopolsky40018472011-02-26 01:02:56 +00003466PyObject *
3467PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003468 const char *encoding,
3469 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003470{
3471 PyObject *v;
3472
3473 if (!PyUnicode_Check(unicode)) {
3474 PyErr_BadArgument();
3475 goto onError;
3476 }
3477
Serhiy Storchaka00939072016-10-27 21:05:49 +03003478 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3479 "PyUnicode_AsDecodedUnicode() is deprecated; "
3480 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3481 return NULL;
3482
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003483 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003484 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485
3486 /* Decode via the codec registry */
3487 v = PyCodec_Decode(unicode, encoding, errors);
3488 if (v == NULL)
3489 goto onError;
3490 if (!PyUnicode_Check(v)) {
3491 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003492 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003493 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003494 encoding,
3495 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003496 Py_DECREF(v);
3497 goto onError;
3498 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003499 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500
Benjamin Peterson29060642009-01-31 22:14:21 +00003501 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003502 return NULL;
3503}
3504
Alexander Belopolsky40018472011-02-26 01:02:56 +00003505PyObject *
3506PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003507 Py_ssize_t size,
3508 const char *encoding,
3509 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003510{
3511 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003512
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003513 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003515 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3517 Py_DECREF(unicode);
3518 return v;
3519}
3520
Alexander Belopolsky40018472011-02-26 01:02:56 +00003521PyObject *
3522PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003523 const char *encoding,
3524 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003525{
3526 PyObject *v;
3527
3528 if (!PyUnicode_Check(unicode)) {
3529 PyErr_BadArgument();
3530 goto onError;
3531 }
3532
Serhiy Storchaka00939072016-10-27 21:05:49 +03003533 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3534 "PyUnicode_AsEncodedObject() is deprecated; "
3535 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3536 "or PyCodec_Encode() for generic encoding", 1) < 0)
3537 return NULL;
3538
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003539 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003540 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003541
3542 /* Encode via the codec registry */
3543 v = PyCodec_Encode(unicode, encoding, errors);
3544 if (v == NULL)
3545 goto onError;
3546 return v;
3547
Benjamin Peterson29060642009-01-31 22:14:21 +00003548 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003549 return NULL;
3550}
3551
Victor Stinner1b579672011-12-17 05:47:23 +01003552
Victor Stinner2cba6b82018-01-10 22:46:15 +01003553static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003554unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003555 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003556{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003557 Py_ssize_t wlen;
3558 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3559 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003560 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003561 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003562
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003563 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003564 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003565 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003566 return NULL;
3567 }
3568
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003569 char *str;
3570 size_t error_pos;
3571 const char *reason;
3572 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003573 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003574 PyMem_Free(wstr);
3575
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003576 if (res != 0) {
3577 if (res == -2) {
3578 PyObject *exc;
3579 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3580 "locale", unicode,
3581 (Py_ssize_t)error_pos,
3582 (Py_ssize_t)(error_pos+1),
3583 reason);
3584 if (exc != NULL) {
3585 PyCodec_StrictErrors(exc);
3586 Py_DECREF(exc);
3587 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003588 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003589 else if (res == -3) {
3590 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3591 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003593 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003594 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003595 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003596 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003597
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003598 PyObject *bytes = PyBytes_FromString(str);
3599 PyMem_RawFree(str);
3600 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601}
3602
Victor Stinnerad158722010-10-27 00:25:46 +00003603PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003604PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3605{
Victor Stinner709d23d2019-05-02 14:56:30 -04003606 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3607 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003608}
3609
3610PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003611PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003612{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003613 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003614#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003615 if (interp->fs_codec.encoding) {
3616 return unicode_encode_utf8(unicode,
3617 interp->fs_codec.error_handler,
3618 interp->fs_codec.errors);
3619 }
3620 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003621 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003622 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003623 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003624 assert(errors != _Py_ERROR_UNKNOWN);
3625 return unicode_encode_utf8(unicode, errors, NULL);
3626 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003627#else
Victor Stinner793b5312011-04-27 00:24:21 +02003628 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3629 cannot use it to encode and decode filenames before it is loaded. Load
3630 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003631 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003632 initialized and the Python codec is loaded.
3633 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003634 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003635 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003636 interp->fs_codec.encoding,
3637 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003638 }
3639 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003640 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003641 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003642 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003643 assert(errors != _Py_ERROR_UNKNOWN);
3644 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003645 }
Victor Stinnerad158722010-10-27 00:25:46 +00003646#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003647}
3648
Alexander Belopolsky40018472011-02-26 01:02:56 +00003649PyObject *
3650PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003651 const char *encoding,
3652 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003653{
3654 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003655 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003656
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657 if (!PyUnicode_Check(unicode)) {
3658 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003659 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003660 }
Fred Drakee4315f52000-05-09 19:53:39 +00003661
Victor Stinner22eb6892019-06-26 00:51:05 +02003662 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3663 return NULL;
3664 }
3665
Victor Stinner942889a2016-09-05 15:40:10 -07003666 if (encoding == NULL) {
3667 return _PyUnicode_AsUTF8String(unicode, errors);
3668 }
3669
Fred Drakee4315f52000-05-09 19:53:39 +00003670 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003671 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3672 char *lower = buflower;
3673
3674 /* Fast paths */
3675 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3676 lower += 3;
3677 if (*lower == '_') {
3678 /* Match "utf8" and "utf_8" */
3679 lower++;
3680 }
3681
3682 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003683 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003684 }
3685 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3686 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3687 }
3688 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3689 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3690 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003691 }
Victor Stinner942889a2016-09-05 15:40:10 -07003692 else {
3693 if (strcmp(lower, "ascii") == 0
3694 || strcmp(lower, "us_ascii") == 0) {
3695 return _PyUnicode_AsASCIIString(unicode, errors);
3696 }
Steve Dowercc16be82016-09-08 10:35:16 -07003697#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003698 else if (strcmp(lower, "mbcs") == 0) {
3699 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3700 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003701#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003702 else if (strcmp(lower, "latin1") == 0 ||
3703 strcmp(lower, "latin_1") == 0 ||
3704 strcmp(lower, "iso_8859_1") == 0 ||
3705 strcmp(lower, "iso8859_1") == 0) {
3706 return _PyUnicode_AsLatin1String(unicode, errors);
3707 }
3708 }
Victor Stinner37296e82010-06-10 13:36:23 +00003709 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003710
3711 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003712 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003713 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003714 return NULL;
3715
3716 /* The normal path */
3717 if (PyBytes_Check(v))
3718 return v;
3719
3720 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003721 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003722 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003723 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003724
3725 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003726 "encoder %s returned bytearray instead of bytes; "
3727 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003728 encoding);
3729 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003730 Py_DECREF(v);
3731 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003732 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003733
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003734 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3735 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003736 Py_DECREF(v);
3737 return b;
3738 }
3739
3740 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003741 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003742 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003743 encoding,
3744 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003745 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003746 return NULL;
3747}
3748
Alexander Belopolsky40018472011-02-26 01:02:56 +00003749PyObject *
3750PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003751 const char *encoding,
3752 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003753{
3754 PyObject *v;
3755
3756 if (!PyUnicode_Check(unicode)) {
3757 PyErr_BadArgument();
3758 goto onError;
3759 }
3760
Serhiy Storchaka00939072016-10-27 21:05:49 +03003761 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3762 "PyUnicode_AsEncodedUnicode() is deprecated; "
3763 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3764 return NULL;
3765
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003766 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003767 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003768
3769 /* Encode via the codec registry */
3770 v = PyCodec_Encode(unicode, encoding, errors);
3771 if (v == NULL)
3772 goto onError;
3773 if (!PyUnicode_Check(v)) {
3774 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003775 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003776 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003777 encoding,
3778 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003779 Py_DECREF(v);
3780 goto onError;
3781 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003782 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003783
Benjamin Peterson29060642009-01-31 22:14:21 +00003784 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003785 return NULL;
3786}
3787
Victor Stinner2cba6b82018-01-10 22:46:15 +01003788static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003789unicode_decode_locale(const char *str, Py_ssize_t len,
3790 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003791{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003792 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3793 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003794 return NULL;
3795 }
3796
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 wchar_t *wstr;
3798 size_t wlen;
3799 const char *reason;
3800 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003801 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003802 if (res != 0) {
3803 if (res == -2) {
3804 PyObject *exc;
3805 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3806 "locale", str, len,
3807 (Py_ssize_t)wlen,
3808 (Py_ssize_t)(wlen + 1),
3809 reason);
3810 if (exc != NULL) {
3811 PyCodec_StrictErrors(exc);
3812 Py_DECREF(exc);
3813 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003814 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003815 else if (res == -3) {
3816 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3817 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003818 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003819 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003820 }
Victor Stinner2f197072011-12-17 07:08:30 +01003821 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003822 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003823
3824 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3825 PyMem_RawFree(wstr);
3826 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003827}
3828
3829PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003830PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3831 const char *errors)
3832{
Victor Stinner709d23d2019-05-02 14:56:30 -04003833 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3834 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003835}
3836
3837PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003838PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003839{
3840 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003841 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3842 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003843}
3844
3845
3846PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003847PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003848 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003849 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3850}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003851
Christian Heimes5894ba72007-11-04 11:43:14 +00003852PyObject*
3853PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3854{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003855 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003856#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003857 if (interp->fs_codec.encoding) {
3858 return unicode_decode_utf8(s, size,
3859 interp->fs_codec.error_handler,
3860 interp->fs_codec.errors,
3861 NULL);
3862 }
3863 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003864 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003865 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003866 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003867 assert(errors != _Py_ERROR_UNKNOWN);
3868 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3869 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003870#else
Victor Stinner793b5312011-04-27 00:24:21 +02003871 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3872 cannot use it to encode and decode filenames before it is loaded. Load
3873 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003874 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003875 initialized and the Python codec is loaded.
3876 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003877 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003878 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003879 interp->fs_codec.encoding,
3880 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003881 }
3882 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003883 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003884 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003885 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003886 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003887 }
Victor Stinnerad158722010-10-27 00:25:46 +00003888#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003889}
3890
Martin v. Löwis011e8422009-05-05 04:43:17 +00003891
3892int
3893PyUnicode_FSConverter(PyObject* arg, void* addr)
3894{
Brett Cannonec6ce872016-09-06 15:50:29 -07003895 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896 PyObject *output = NULL;
3897 Py_ssize_t size;
3898 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003899 if (arg == NULL) {
3900 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003901 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003902 return 1;
3903 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003904 path = PyOS_FSPath(arg);
3905 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003906 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003907 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003908 if (PyBytes_Check(path)) {
3909 output = path;
3910 }
3911 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3912 output = PyUnicode_EncodeFSDefault(path);
3913 Py_DECREF(path);
3914 if (!output) {
3915 return 0;
3916 }
3917 assert(PyBytes_Check(output));
3918 }
3919
Victor Stinner0ea2a462010-04-30 00:22:08 +00003920 size = PyBytes_GET_SIZE(output);
3921 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003922 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003923 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003924 Py_DECREF(output);
3925 return 0;
3926 }
3927 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003928 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003929}
3930
3931
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003932int
3933PyUnicode_FSDecoder(PyObject* arg, void* addr)
3934{
Brett Cannona5711202016-09-06 19:36:01 -07003935 int is_buffer = 0;
3936 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003937 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003938 if (arg == NULL) {
3939 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003940 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003941 return 1;
3942 }
Brett Cannona5711202016-09-06 19:36:01 -07003943
3944 is_buffer = PyObject_CheckBuffer(arg);
3945 if (!is_buffer) {
3946 path = PyOS_FSPath(arg);
3947 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003948 return 0;
3949 }
Brett Cannona5711202016-09-06 19:36:01 -07003950 }
3951 else {
3952 path = arg;
3953 Py_INCREF(arg);
3954 }
3955
3956 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003957 output = path;
3958 }
3959 else if (PyBytes_Check(path) || is_buffer) {
3960 PyObject *path_bytes = NULL;
3961
3962 if (!PyBytes_Check(path) &&
3963 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003964 "path should be string, bytes, or os.PathLike, not %.200s",
3965 Py_TYPE(arg)->tp_name)) {
3966 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003967 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003968 }
3969 path_bytes = PyBytes_FromObject(path);
3970 Py_DECREF(path);
3971 if (!path_bytes) {
3972 return 0;
3973 }
3974 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3975 PyBytes_GET_SIZE(path_bytes));
3976 Py_DECREF(path_bytes);
3977 if (!output) {
3978 return 0;
3979 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003980 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003981 else {
3982 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003983 "path should be string, bytes, or os.PathLike, not %.200s",
3984 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003985 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003986 return 0;
3987 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003988 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003989 Py_DECREF(output);
3990 return 0;
3991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003993 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003994 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003995 Py_DECREF(output);
3996 return 0;
3997 }
3998 *(PyObject**)addr = output;
3999 return Py_CLEANUP_SUPPORTED;
4000}
4001
4002
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004003const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004004PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004005{
Christian Heimesf3863112007-11-22 07:46:41 +00004006 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004007
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004008 if (!PyUnicode_Check(unicode)) {
4009 PyErr_BadArgument();
4010 return NULL;
4011 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004012 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004013 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004014
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004015 if (PyUnicode_UTF8(unicode) == NULL) {
4016 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004017 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 if (bytes == NULL)
4019 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004020 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4021 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004022 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023 Py_DECREF(bytes);
4024 return NULL;
4025 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004026 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004027 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004028 PyBytes_AS_STRING(bytes),
4029 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 Py_DECREF(bytes);
4031 }
4032
4033 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004034 *psize = PyUnicode_UTF8_LENGTH(unicode);
4035 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004036}
4037
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004038const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004039PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004040{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4042}
4043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004044Py_UNICODE *
4045PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4046{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047 if (!PyUnicode_Check(unicode)) {
4048 PyErr_BadArgument();
4049 return NULL;
4050 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004051 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4052 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004053 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004054 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004055 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056
Serhiy Storchakac46db922018-10-23 22:58:24 +03004057 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4058 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4059 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004061 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004062 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4063 if (w == NULL) {
4064 PyErr_NoMemory();
4065 return NULL;
4066 }
4067 unicode_copy_as_widechar(unicode, w, wlen + 1);
4068 _PyUnicode_WSTR(unicode) = w;
4069 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4070 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004071 }
4072 }
4073 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004074 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004075 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004076}
4077
Alexander Belopolsky40018472011-02-26 01:02:56 +00004078Py_UNICODE *
4079PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004081 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004082}
4083
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004084const Py_UNICODE *
4085_PyUnicode_AsUnicode(PyObject *unicode)
4086{
4087 Py_ssize_t size;
4088 const Py_UNICODE *wstr;
4089
4090 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4091 if (wstr && wcslen(wstr) != (size_t)size) {
4092 PyErr_SetString(PyExc_ValueError, "embedded null character");
4093 return NULL;
4094 }
4095 return wstr;
4096}
4097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004098
Alexander Belopolsky40018472011-02-26 01:02:56 +00004099Py_ssize_t
4100PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004101{
4102 if (!PyUnicode_Check(unicode)) {
4103 PyErr_BadArgument();
4104 goto onError;
4105 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004106 if (_PyUnicode_WSTR(unicode) == NULL) {
4107 if (PyUnicode_AsUnicode(unicode) == NULL)
4108 goto onError;
4109 }
4110 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004111
Benjamin Peterson29060642009-01-31 22:14:21 +00004112 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004113 return -1;
4114}
4115
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004116Py_ssize_t
4117PyUnicode_GetLength(PyObject *unicode)
4118{
Victor Stinner07621332012-06-16 04:53:46 +02004119 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 PyErr_BadArgument();
4121 return -1;
4122 }
Victor Stinner07621332012-06-16 04:53:46 +02004123 if (PyUnicode_READY(unicode) == -1)
4124 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125 return PyUnicode_GET_LENGTH(unicode);
4126}
4127
4128Py_UCS4
4129PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4130{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004131 void *data;
4132 int kind;
4133
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004134 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004135 PyErr_BadArgument();
4136 return (Py_UCS4)-1;
4137 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004138 if (PyUnicode_READY(unicode) == -1) {
4139 return (Py_UCS4)-1;
4140 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004141 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004142 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004143 return (Py_UCS4)-1;
4144 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004145 data = PyUnicode_DATA(unicode);
4146 kind = PyUnicode_KIND(unicode);
4147 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004148}
4149
4150int
4151PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4152{
4153 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004154 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004155 return -1;
4156 }
Victor Stinner488fa492011-12-12 00:01:39 +01004157 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004158 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004159 PyErr_SetString(PyExc_IndexError, "string index out of range");
4160 return -1;
4161 }
Victor Stinner488fa492011-12-12 00:01:39 +01004162 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004163 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004164 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4165 PyErr_SetString(PyExc_ValueError, "character out of range");
4166 return -1;
4167 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004168 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4169 index, ch);
4170 return 0;
4171}
4172
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage; callers must not free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4178
Victor Stinner554f3f02010-06-16 23:33:54 +00004179/* create or adjust a UnicodeDecodeError */
4180static void
4181make_decode_exception(PyObject **exceptionObject,
4182 const char *encoding,
4183 const char *input, Py_ssize_t length,
4184 Py_ssize_t startpos, Py_ssize_t endpos,
4185 const char *reason)
4186{
4187 if (*exceptionObject == NULL) {
4188 *exceptionObject = PyUnicodeDecodeError_Create(
4189 encoding, input, length, startpos, endpos, reason);
4190 }
4191 else {
4192 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4193 goto onError;
4194 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4195 goto onError;
4196 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4197 goto onError;
4198 }
4199 return;
4200
4201onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004202 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004203}
4204
Steve Dowercc16be82016-09-08 10:35:16 -07004205#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004206static int
4207widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4208{
4209 if (newsize > *size) {
4210 wchar_t *newbuf = *buf;
4211 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4212 PyErr_NoMemory();
4213 return -1;
4214 }
4215 *buf = newbuf;
4216 }
4217 *size = newsize;
4218 return 0;
4219}
4220
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004221/* error handling callback helper:
4222 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004223 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224 and adjust various state variables.
4225 return 0 on success, -1 on error
4226*/
4227
Alexander Belopolsky40018472011-02-26 01:02:56 +00004228static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004229unicode_decode_call_errorhandler_wchar(
4230 const char *errors, PyObject **errorHandler,
4231 const char *encoding, const char *reason,
4232 const char **input, const char **inend, Py_ssize_t *startinpos,
4233 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004234 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004235{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004236 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004237
4238 PyObject *restuple = NULL;
4239 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004240 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004241 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004242 Py_ssize_t requiredsize;
4243 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004244 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245 wchar_t *repwstr;
4246 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004247
4248 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004249 *errorHandler = PyCodec_LookupError(errors);
4250 if (*errorHandler == NULL)
4251 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004252 }
4253
Victor Stinner554f3f02010-06-16 23:33:54 +00004254 make_decode_exception(exceptionObject,
4255 encoding,
4256 *input, *inend - *input,
4257 *startinpos, *endinpos,
4258 reason);
4259 if (*exceptionObject == NULL)
4260 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004261
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004262 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004263 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004264 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004266 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004267 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004268 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004269 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004270 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004271
4272 /* Copy back the bytes variables, which might have been modified by the
4273 callback */
4274 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4275 if (!inputobj)
4276 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004277 *input = PyBytes_AS_STRING(inputobj);
4278 insize = PyBytes_GET_SIZE(inputobj);
4279 *inend = *input + insize;
4280 /* we can DECREF safely, as the exception has another reference,
4281 so the object won't go away. */
4282 Py_DECREF(inputobj);
4283
4284 if (newpos<0)
4285 newpos = insize+newpos;
4286 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004287 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004288 goto onError;
4289 }
4290
4291 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4292 if (repwstr == NULL)
4293 goto onError;
4294 /* need more space? (at least enough for what we
4295 have+the replacement+the rest of the string (starting
4296 at the new input position), so we won't have to check space
4297 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004298 requiredsize = *outpos;
4299 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4300 goto overflow;
4301 requiredsize += repwlen;
4302 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4303 goto overflow;
4304 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004305 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004306 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004307 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004309 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004311 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004313 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004315 *endinpos = newpos;
4316 *inptr = *input + newpos;
4317
4318 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004319 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004320 return 0;
4321
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004322 overflow:
4323 PyErr_SetString(PyExc_OverflowError,
4324 "decoded result is too long for a Python string");
4325
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004326 onError:
4327 Py_XDECREF(restuple);
4328 return -1;
4329}
Steve Dowercc16be82016-09-08 10:35:16 -07004330#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004331
4332static int
4333unicode_decode_call_errorhandler_writer(
4334 const char *errors, PyObject **errorHandler,
4335 const char *encoding, const char *reason,
4336 const char **input, const char **inend, Py_ssize_t *startinpos,
4337 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4338 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4339{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004340 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004341
4342 PyObject *restuple = NULL;
4343 PyObject *repunicode = NULL;
4344 Py_ssize_t insize;
4345 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004346 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004347 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004349 int need_to_grow = 0;
4350 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004351
4352 if (*errorHandler == NULL) {
4353 *errorHandler = PyCodec_LookupError(errors);
4354 if (*errorHandler == NULL)
4355 goto onError;
4356 }
4357
4358 make_decode_exception(exceptionObject,
4359 encoding,
4360 *input, *inend - *input,
4361 *startinpos, *endinpos,
4362 reason);
4363 if (*exceptionObject == NULL)
4364 goto onError;
4365
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004366 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004367 if (restuple == NULL)
4368 goto onError;
4369 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004370 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 goto onError;
4372 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004373 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004374 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004375
4376 /* Copy back the bytes variables, which might have been modified by the
4377 callback */
4378 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4379 if (!inputobj)
4380 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004381 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004382 *input = PyBytes_AS_STRING(inputobj);
4383 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004384 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004385 /* we can DECREF safely, as the exception has another reference,
4386 so the object won't go away. */
4387 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004388
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004389 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004390 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004391 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004392 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004393 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004394 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004395
Victor Stinner170ca6f2013-04-18 00:25:28 +02004396 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004397 if (replen > 1) {
4398 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004399 need_to_grow = 1;
4400 }
4401 new_inptr = *input + newpos;
4402 if (*inend - new_inptr > remain) {
4403 /* We don't know the decoding algorithm here so we make the worst
4404 assumption that one byte decodes to one unicode character.
4405 If unfortunately one byte could decode to more unicode characters,
4406 the decoder may write out-of-bound then. Is it possible for the
4407 algorithms using this function? */
4408 writer->min_length += *inend - new_inptr - remain;
4409 need_to_grow = 1;
4410 }
4411 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004412 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004413 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004414 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4415 goto onError;
4416 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004417 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004418 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004419
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004420 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004421 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004422
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004423 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004424 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004425 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004426
Benjamin Peterson29060642009-01-31 22:14:21 +00004427 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004428 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430}
4431
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that an
 * expression argument (e.g. a conditional expression) keeps its own
 * precedence instead of being re-associated with the && that follows. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004512
Alexander Belopolsky40018472011-02-26 01:02:56 +00004513PyObject *
4514PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004515 Py_ssize_t size,
4516 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004517{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004518 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4519}
4520
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521/* The decoder. The only state we preserve is our read position,
4522 * i.e. how many characters we have consumed. So if we end in the
4523 * middle of a shift sequence we have to back off the read position
4524 * and the output to the beginning of the sequence, otherwise we lose
4525 * all the shift state (seen bits, number of bits seen, high
4526 * surrogate). */
4527
Alexander Belopolsky40018472011-02-26 01:02:56 +00004528PyObject *
4529PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004530 Py_ssize_t size,
4531 const char *errors,
4532 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004533{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004534 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004535 Py_ssize_t startinpos;
4536 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004537 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004538 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004539 const char *errmsg = "";
4540 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004541 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 unsigned int base64bits = 0;
4543 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004544 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004545 PyObject *errorHandler = NULL;
4546 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004547
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004548 if (size == 0) {
4549 if (consumed)
4550 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004551 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004552 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004553
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004554 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004555 _PyUnicodeWriter_Init(&writer);
4556 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004557
4558 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004559 e = s + size;
4560
4561 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004562 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004563 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004564 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004565
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 if (inShift) { /* in a base-64 section */
4567 if (IS_BASE64(ch)) { /* consume a base-64 character */
4568 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4569 base64bits += 6;
4570 s++;
4571 if (base64bits >= 16) {
4572 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004573 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits -= 16;
4575 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004576 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004577 if (surrogate) {
4578 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004579 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4580 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004581 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004582 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004584 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 }
4586 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004587 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004588 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590 }
4591 }
Victor Stinner551ac952011-11-29 22:58:13 +01004592 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 /* first surrogate */
4594 surrogate = outCh;
4595 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004597 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004598 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004599 }
4600 }
4601 }
4602 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004603 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004604 if (base64bits > 0) { /* left-over bits */
4605 if (base64bits >= 6) {
4606 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004607 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 errmsg = "partial character in shift sequence";
4609 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004610 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 else {
4612 /* Some bits remain; they should be zero */
4613 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004614 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 errmsg = "non-zero padding bits in shift sequence";
4616 goto utf7Error;
4617 }
4618 }
4619 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004620 if (surrogate && DECODE_DIRECT(ch)) {
4621 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4622 goto onError;
4623 }
4624 surrogate = 0;
4625 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 /* '-' is absorbed; other terminating
4627 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004628 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004629 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004630 }
4631 }
4632 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004633 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004634 s++; /* consume '+' */
4635 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004636 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004637 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004638 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004640 else if (s < e && !IS_BASE64(*s)) {
4641 s++;
4642 errmsg = "ill-formed sequence";
4643 goto utf7Error;
4644 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004645 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004646 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004647 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004650 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004651 }
4652 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004654 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004655 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004656 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004657 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004658 else {
4659 startinpos = s-starts;
4660 s++;
4661 errmsg = "unexpected special character";
4662 goto utf7Error;
4663 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004664 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004665utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004666 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004667 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004668 errors, &errorHandler,
4669 "utf7", errmsg,
4670 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004673 }
4674
Antoine Pitrou244651a2009-05-04 18:56:13 +00004675 /* end of string */
4676
4677 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4678 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004679 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004680 if (surrogate ||
4681 (base64bits >= 6) ||
4682 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004683 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004684 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 errors, &errorHandler,
4686 "utf7", "unterminated shift sequence",
4687 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004688 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 goto onError;
4690 if (s < e)
4691 goto restart;
4692 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694
4695 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004696 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004698 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004699 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004700 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004701 writer.kind, writer.data, shiftOutStart);
4702 Py_XDECREF(errorHandler);
4703 Py_XDECREF(exc);
4704 _PyUnicodeWriter_Dealloc(&writer);
4705 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004706 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004707 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 }
4709 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004710 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004711 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004712 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004714 Py_XDECREF(errorHandler);
4715 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004716 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717
Benjamin Peterson29060642009-01-31 22:14:21 +00004718 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004719 Py_XDECREF(errorHandler);
4720 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004721 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004722 return NULL;
4723}
4724
4725
Alexander Belopolsky40018472011-02-26 01:02:56 +00004726PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004727_PyUnicode_EncodeUTF7(PyObject *str,
4728 int base64SetO,
4729 int base64WhiteSpace,
4730 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004731{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004732 int kind;
4733 void *data;
4734 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004735 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004736 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004737 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004738 unsigned int base64bits = 0;
4739 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740 char * out;
4741 char * start;
4742
Benjamin Petersonbac79492012-01-14 13:34:47 -05004743 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004744 return NULL;
4745 kind = PyUnicode_KIND(str);
4746 data = PyUnicode_DATA(str);
4747 len = PyUnicode_GET_LENGTH(str);
4748
4749 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004750 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004753 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004754 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004755 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004756 if (v == NULL)
4757 return NULL;
4758
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004759 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004760 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004761 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004762
Antoine Pitrou244651a2009-05-04 18:56:13 +00004763 if (inShift) {
4764 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4765 /* shifting out */
4766 if (base64bits) { /* output remaining bits */
4767 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4768 base64buffer = 0;
4769 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004770 }
4771 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004772 /* Characters not in the BASE64 set implicitly unshift the sequence
4773 so no '-' is required, except if the character is itself a '-' */
4774 if (IS_BASE64(ch) || ch == '-') {
4775 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004776 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004777 *out++ = (char) ch;
4778 }
4779 else {
4780 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004781 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004782 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004783 else { /* not in a shift sequence */
4784 if (ch == '+') {
4785 *out++ = '+';
4786 *out++ = '-';
4787 }
4788 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4789 *out++ = (char) ch;
4790 }
4791 else {
4792 *out++ = '+';
4793 inShift = 1;
4794 goto encode_char;
4795 }
4796 }
4797 continue;
4798encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004799 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004800 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004801
Antoine Pitrou244651a2009-05-04 18:56:13 +00004802 /* code first surrogate */
4803 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004804 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004805 while (base64bits >= 6) {
4806 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4807 base64bits -= 6;
4808 }
4809 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004810 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004811 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004812 base64bits += 16;
4813 base64buffer = (base64buffer << 16) | ch;
4814 while (base64bits >= 6) {
4815 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4816 base64bits -= 6;
4817 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004818 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004819 if (base64bits)
4820 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4821 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004822 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004823 if (_PyBytes_Resize(&v, out - start) < 0)
4824 return NULL;
4825 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004827PyObject *
4828PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4829 Py_ssize_t size,
4830 int base64SetO,
4831 int base64WhiteSpace,
4832 const char *errors)
4833{
4834 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004835 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004836 if (tmp == NULL)
4837 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004838 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004839 base64WhiteSpace, errors);
4840 Py_DECREF(tmp);
4841 return result;
4842}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004843
/* Drop the helper macros used by the UTF-7 codec so that they are not
   visible to the code that follows. */
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
#undef FROM_BASE64
#undef IS_BASE64
#undef TO_BASE64
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004849
Guido van Rossumd57fd912000-03-10 22:53:23 +00004850/* --- UTF-8 Codec -------------------------------------------------------- */
4851
Alexander Belopolsky40018472011-02-26 01:02:56 +00004852PyObject *
4853PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004854 Py_ssize_t size,
4855 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004856{
Walter Dörwald69652032004-09-07 20:24:22 +00004857 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4858}
4859
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004860#include "stringlib/asciilib.h"
4861#include "stringlib/codecs.h"
4862#include "stringlib/undef.h"
4863
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004864#include "stringlib/ucs1lib.h"
4865#include "stringlib/codecs.h"
4866#include "stringlib/undef.h"
4867
4868#include "stringlib/ucs2lib.h"
4869#include "stringlib/codecs.h"
4870#include "stringlib/undef.h"
4871
4872#include "stringlib/ucs4lib.h"
4873#include "stringlib/codecs.h"
4874#include "stringlib/undef.h"
4875
Antoine Pitrouab868312009-01-10 15:40:25 +00004876/* Mask to quickly check whether a C 'long' contains a
4877 non-ASCII, UTF8-encoded char. */
4878#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004879# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004880#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004881# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004882#else
4883# error C 'long' size should be either 4 or 8!
4884#endif
4885
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004886static Py_ssize_t
4887ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004888{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004890 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004891
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004892 /*
4893 * Issue #17237: m68k is a bit different from most architectures in
4894 * that objects do not use "natural alignment" - for example, int and
4895 * long are only aligned at 2-byte boundaries. Therefore the assert()
4896 * won't work; also, tests have shown that skipping the "optimised
4897 * version" will even speed up m68k.
4898 */
4899#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004900#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004901 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4902 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 /* Fast path, see in STRINGLIB(utf8_decode) for
4904 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004905 /* Help allocation */
4906 const char *_p = p;
4907 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004908 while (_p < aligned_end) {
4909 unsigned long value = *(const unsigned long *) _p;
4910 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004911 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 *((unsigned long *)q) = value;
4913 _p += SIZEOF_LONG;
4914 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004915 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 p = _p;
4917 while (p < end) {
4918 if ((unsigned char)*p & 0x80)
4919 break;
4920 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004921 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004922 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004923 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004925#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 while (p < end) {
4927 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4928 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004929 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004930 /* Help allocation */
4931 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004932 while (_p < aligned_end) {
4933 unsigned long value = *(unsigned long *) _p;
4934 if (value & ASCII_CHAR_MASK)
4935 break;
4936 _p += SIZEOF_LONG;
4937 }
4938 p = _p;
4939 if (_p == end)
4940 break;
4941 }
4942 if ((unsigned char)*p & 0x80)
4943 break;
4944 ++p;
4945 }
4946 memcpy(dest, start, p - start);
4947 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004948}
Antoine Pitrouab868312009-01-10 15:40:25 +00004949
Victor Stinner709d23d2019-05-02 14:56:30 -04004950static PyObject *
4951unicode_decode_utf8(const char *s, Py_ssize_t size,
4952 _Py_error_handler error_handler, const char *errors,
4953 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004954{
Victor Stinner785938e2011-12-11 20:09:03 +01004955 if (size == 0) {
4956 if (consumed)
4957 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004958 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004959 }
4960
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4962 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004963 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004964 *consumed = 1;
4965 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004966 }
4967
Inada Naoki770847a2019-06-24 12:30:24 +09004968 const char *starts = s;
4969 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004970
Inada Naoki770847a2019-06-24 12:30:24 +09004971 // fast path: try ASCII string.
4972 PyObject *u = PyUnicode_New(size, 127);
4973 if (u == NULL) {
4974 return NULL;
4975 }
4976 s += ascii_decode(s, end, PyUnicode_DATA(u));
4977 if (s == end) {
4978 return u;
4979 }
4980
4981 // Use _PyUnicodeWriter after fast path is failed.
4982 _PyUnicodeWriter writer;
4983 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4984 writer.pos = s - starts;
4985
4986 Py_ssize_t startinpos, endinpos;
4987 const char *errmsg = "";
4988 PyObject *error_handler_obj = NULL;
4989 PyObject *exc = NULL;
4990
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 while (s < end) {
4992 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004993 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004994
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004996 if (PyUnicode_IS_ASCII(writer.buffer))
4997 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004998 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004999 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002 } else {
5003 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005004 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 }
5006
5007 switch (ch) {
5008 case 0:
5009 if (s == end || consumed)
5010 goto End;
5011 errmsg = "unexpected end of data";
5012 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005013 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005014 break;
5015 case 1:
5016 errmsg = "invalid start byte";
5017 startinpos = s - starts;
5018 endinpos = startinpos + 1;
5019 break;
5020 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005021 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5022 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5023 {
5024 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005025 goto End;
5026 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005027 /* fall through */
5028 case 3:
5029 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005030 errmsg = "invalid continuation byte";
5031 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005032 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005033 break;
5034 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005035 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005036 goto onError;
5037 continue;
5038 }
5039
Victor Stinner1d65d912015-10-05 13:43:50 +02005040 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005041 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005042
5043 switch (error_handler) {
5044 case _Py_ERROR_IGNORE:
5045 s += (endinpos - startinpos);
5046 break;
5047
5048 case _Py_ERROR_REPLACE:
5049 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5050 goto onError;
5051 s += (endinpos - startinpos);
5052 break;
5053
5054 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005055 {
5056 Py_ssize_t i;
5057
Victor Stinner1d65d912015-10-05 13:43:50 +02005058 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5059 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005060 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 ch = (Py_UCS4)(unsigned char)(starts[i]);
5062 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5063 ch + 0xdc00);
5064 writer.pos++;
5065 }
5066 s += (endinpos - startinpos);
5067 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005068 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005069
5070 default:
5071 if (unicode_decode_call_errorhandler_writer(
5072 errors, &error_handler_obj,
5073 "utf-8", errmsg,
5074 &starts, &end, &startinpos, &endinpos, &exc, &s,
5075 &writer))
5076 goto onError;
5077 }
Victor Stinner785938e2011-12-11 20:09:03 +01005078 }
5079
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005080End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 if (consumed)
5082 *consumed = s - starts;
5083
Victor Stinner1d65d912015-10-05 13:43:50 +02005084 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005086 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087
5088onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005089 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005090 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005091 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005093}
5094
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005095
Victor Stinner709d23d2019-05-02 14:56:30 -04005096PyObject *
5097PyUnicode_DecodeUTF8Stateful(const char *s,
5098 Py_ssize_t size,
5099 const char *errors,
5100 Py_ssize_t *consumed)
5101{
5102 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5103}
5104
5105
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005106/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5107 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005108
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005109 On success, write a pointer to a newly allocated wide character string into
5110 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5111 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005112
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005113 On memory allocation failure, return -1.
5114
5115 On decoding error (if surrogateescape is zero), return -2. If wlen is
5116 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5117 is not NULL, write the decoding error message into *reason. */
5118int
5119_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005120 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005121{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005122 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005123 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005124 wchar_t *unicode;
5125 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005126
Victor Stinner3d4226a2018-08-29 22:21:32 +02005127 int surrogateescape = 0;
5128 int surrogatepass = 0;
5129 switch (errors)
5130 {
5131 case _Py_ERROR_STRICT:
5132 break;
5133 case _Py_ERROR_SURROGATEESCAPE:
5134 surrogateescape = 1;
5135 break;
5136 case _Py_ERROR_SURROGATEPASS:
5137 surrogatepass = 1;
5138 break;
5139 default:
5140 return -3;
5141 }
5142
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005143 /* Note: size will always be longer than the resulting Unicode
5144 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005145 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005147 }
5148
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005149 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005150 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005151 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005152 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005153
5154 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005155 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005156 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005157 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005158 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005160 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005161#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005162 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005163#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005164 if (ch > 0xFF) {
5165#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005166 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005167#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005168 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005169 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005170 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5171 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5172#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005173 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005175 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005176 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005177 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005178
5179 if (surrogateescape) {
5180 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5181 }
5182 else {
5183 /* Is it a valid three-byte code? */
5184 if (surrogatepass
5185 && (e - s) >= 3
5186 && (s[0] & 0xf0) == 0xe0
5187 && (s[1] & 0xc0) == 0x80
5188 && (s[2] & 0xc0) == 0x80)
5189 {
5190 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5191 s += 3;
5192 unicode[outpos++] = ch;
5193 }
5194 else {
5195 PyMem_RawFree(unicode );
5196 if (reason != NULL) {
5197 switch (ch) {
5198 case 0:
5199 *reason = "unexpected end of data";
5200 break;
5201 case 1:
5202 *reason = "invalid start byte";
5203 break;
5204 /* 2, 3, 4 */
5205 default:
5206 *reason = "invalid continuation byte";
5207 break;
5208 }
5209 }
5210 if (wlen != NULL) {
5211 *wlen = s - orig_s;
5212 }
5213 return -2;
5214 }
5215 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005216 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005217 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005218 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219 if (wlen) {
5220 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005221 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005222 *wstr = unicode;
5223 return 0;
5224}
5225
Victor Stinner5f9cf232019-03-19 01:46:25 +01005226
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005227wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005228_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5229 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005230{
5231 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005232 int res = _Py_DecodeUTF8Ex(arg, arglen,
5233 &wstr, wlen,
5234 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005235 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005236 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5237 assert(res != -3);
5238 if (wlen) {
5239 *wlen = (size_t)res;
5240 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005241 return NULL;
5242 }
5243 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005244}
5245
Antoine Pitrouab868312009-01-10 15:40:25 +00005246
Victor Stinnere47e6982017-12-21 15:45:16 +01005247/* UTF-8 encoder using the surrogateescape error handler .
5248
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005249 On success, return 0 and write the newly allocated character string (use
5250 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005251
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005252 On encoding failure, return -2 and write the position of the invalid
5253 surrogate character into *error_pos (if error_pos is set) and the decoding
5254 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005255
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005256 On memory allocation failure, return -1. */
5257int
5258_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005259 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005260{
5261 const Py_ssize_t max_char_size = 4;
5262 Py_ssize_t len = wcslen(text);
5263
5264 assert(len >= 0);
5265
Victor Stinner3d4226a2018-08-29 22:21:32 +02005266 int surrogateescape = 0;
5267 int surrogatepass = 0;
5268 switch (errors)
5269 {
5270 case _Py_ERROR_STRICT:
5271 break;
5272 case _Py_ERROR_SURROGATEESCAPE:
5273 surrogateescape = 1;
5274 break;
5275 case _Py_ERROR_SURROGATEPASS:
5276 surrogatepass = 1;
5277 break;
5278 default:
5279 return -3;
5280 }
5281
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005282 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5283 return -1;
5284 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005285 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005286 if (raw_malloc) {
5287 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005288 }
5289 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005290 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005291 }
5292 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005293 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005294 }
5295
5296 char *p = bytes;
5297 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005298 for (i = 0; i < len; ) {
5299 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005300 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005301 i++;
5302#if Py_UNICODE_SIZE == 2
5303 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5304 && i < len
5305 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5306 {
5307 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5308 i++;
5309 }
5310#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005311
5312 if (ch < 0x80) {
5313 /* Encode ASCII */
5314 *p++ = (char) ch;
5315
5316 }
5317 else if (ch < 0x0800) {
5318 /* Encode Latin-1 */
5319 *p++ = (char)(0xc0 | (ch >> 6));
5320 *p++ = (char)(0x80 | (ch & 0x3f));
5321 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005322 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005323 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005324 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005325 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005326 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005327 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005328 if (reason != NULL) {
5329 *reason = "encoding error";
5330 }
5331 if (raw_malloc) {
5332 PyMem_RawFree(bytes);
5333 }
5334 else {
5335 PyMem_Free(bytes);
5336 }
5337 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005338 }
5339 *p++ = (char)(ch & 0xff);
5340 }
5341 else if (ch < 0x10000) {
5342 *p++ = (char)(0xe0 | (ch >> 12));
5343 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5344 *p++ = (char)(0x80 | (ch & 0x3f));
5345 }
5346 else { /* ch >= 0x10000 */
5347 assert(ch <= MAX_UNICODE);
5348 /* Encode UCS4 Unicode ordinals */
5349 *p++ = (char)(0xf0 | (ch >> 18));
5350 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5351 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5352 *p++ = (char)(0x80 | (ch & 0x3f));
5353 }
5354 }
5355 *p++ = '\0';
5356
5357 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005358 char *bytes2;
5359 if (raw_malloc) {
5360 bytes2 = PyMem_RawRealloc(bytes, final_size);
5361 }
5362 else {
5363 bytes2 = PyMem_Realloc(bytes, final_size);
5364 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005365 if (bytes2 == NULL) {
5366 if (error_pos != NULL) {
5367 *error_pos = (size_t)-1;
5368 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005369 if (raw_malloc) {
5370 PyMem_RawFree(bytes);
5371 }
5372 else {
5373 PyMem_Free(bytes);
5374 }
5375 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005376 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005377 *str = bytes2;
5378 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005379}
5380
5381
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005382/* Primary internal function which creates utf8 encoded bytes objects.
5383
5384 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005385 and allocate exactly as much space needed at the end. Else allocate the
5386 maximum possible needed (4 result bytes per Unicode character), and return
5387 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005388*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005389static PyObject *
5390unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5391 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005392{
Victor Stinner6099a032011-12-18 14:22:26 +01005393 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005394 void *data;
5395 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005397 if (!PyUnicode_Check(unicode)) {
5398 PyErr_BadArgument();
5399 return NULL;
5400 }
5401
5402 if (PyUnicode_READY(unicode) == -1)
5403 return NULL;
5404
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005405 if (PyUnicode_UTF8(unicode))
5406 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5407 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005408
5409 kind = PyUnicode_KIND(unicode);
5410 data = PyUnicode_DATA(unicode);
5411 size = PyUnicode_GET_LENGTH(unicode);
5412
Benjamin Petersonead6b532011-12-20 17:23:42 -06005413 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005414 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005415 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005416 case PyUnicode_1BYTE_KIND:
5417 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5418 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005419 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005420 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005421 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005422 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005423 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005424 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005425}
5426
Alexander Belopolsky40018472011-02-26 01:02:56 +00005427PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005428_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5429{
5430 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5431}
5432
5433
5434PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005435PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5436 Py_ssize_t size,
5437 const char *errors)
5438{
5439 PyObject *v, *unicode;
5440
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005441 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005442 if (unicode == NULL)
5443 return NULL;
5444 v = _PyUnicode_AsUTF8String(unicode, errors);
5445 Py_DECREF(unicode);
5446 return v;
5447}
5448
5449PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005450PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005451{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005452 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005453}
5454
Walter Dörwald41980ca2007-08-16 21:55:45 +00005455/* --- UTF-32 Codec ------------------------------------------------------- */
5456
5457PyObject *
5458PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005459 Py_ssize_t size,
5460 const char *errors,
5461 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005462{
5463 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5464}
5465
5466PyObject *
5467PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005468 Py_ssize_t size,
5469 const char *errors,
5470 int *byteorder,
5471 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005472{
5473 const char *starts = s;
5474 Py_ssize_t startinpos;
5475 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005476 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005477 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005478 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005480 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005481 PyObject *errorHandler = NULL;
5482 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005483
Walter Dörwald41980ca2007-08-16 21:55:45 +00005484 q = (unsigned char *)s;
5485 e = q + size;
5486
5487 if (byteorder)
5488 bo = *byteorder;
5489
5490 /* Check for BOM marks (U+FEFF) in the input and adjust current
5491 byte order setting accordingly. In native mode, the leading BOM
5492 mark is skipped, in all other modes, it is copied to the output
5493 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005494 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005495 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005496 if (bom == 0x0000FEFF) {
5497 bo = -1;
5498 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005500 else if (bom == 0xFFFE0000) {
5501 bo = 1;
5502 q += 4;
5503 }
5504 if (byteorder)
5505 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005506 }
5507
Victor Stinnere64322e2012-10-30 23:12:47 +01005508 if (q == e) {
5509 if (consumed)
5510 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005511 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005512 }
5513
Victor Stinnere64322e2012-10-30 23:12:47 +01005514#ifdef WORDS_BIGENDIAN
5515 le = bo < 0;
5516#else
5517 le = bo <= 0;
5518#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005519 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005520
Victor Stinner8f674cc2013-04-17 23:02:17 +02005521 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005522 writer.min_length = (e - q + 3) / 4;
5523 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005524 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005525
Victor Stinnere64322e2012-10-30 23:12:47 +01005526 while (1) {
5527 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005528 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005529
Victor Stinnere64322e2012-10-30 23:12:47 +01005530 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005531 enum PyUnicode_Kind kind = writer.kind;
5532 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005534 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005535 if (le) {
5536 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005537 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005538 if (ch > maxch)
5539 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005540 if (kind != PyUnicode_1BYTE_KIND &&
5541 Py_UNICODE_IS_SURROGATE(ch))
5542 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005543 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005544 q += 4;
5545 } while (q <= last);
5546 }
5547 else {
5548 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005549 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005550 if (ch > maxch)
5551 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005552 if (kind != PyUnicode_1BYTE_KIND &&
5553 Py_UNICODE_IS_SURROGATE(ch))
5554 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005555 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005556 q += 4;
5557 } while (q <= last);
5558 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005559 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005560 }
5561
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005562 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005563 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005564 startinpos = ((const char *)q) - starts;
5565 endinpos = startinpos + 4;
5566 }
5567 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005568 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005569 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005570 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005571 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 startinpos = ((const char *)q) - starts;
5573 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005574 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005575 else {
5576 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005577 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005578 goto onError;
5579 q += 4;
5580 continue;
5581 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005582 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005583 startinpos = ((const char *)q) - starts;
5584 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005585 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005586
5587 /* The remaining input chars are ignored if the callback
5588 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005589 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005590 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005591 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005592 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005593 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005595 }
5596
Walter Dörwald41980ca2007-08-16 21:55:45 +00005597 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005599
Walter Dörwald41980ca2007-08-16 21:55:45 +00005600 Py_XDECREF(errorHandler);
5601 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005602 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005603
Benjamin Peterson29060642009-01-31 22:14:21 +00005604 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005605 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005606 Py_XDECREF(errorHandler);
5607 Py_XDECREF(exc);
5608 return NULL;
5609}
5610
5611PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005612_PyUnicode_EncodeUTF32(PyObject *str,
5613 const char *errors,
5614 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005615{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005616 enum PyUnicode_Kind kind;
5617 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005618 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005619 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005620 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005621#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005622 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005623#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005624 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005625#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005626 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005627 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005628 PyObject *errorHandler = NULL;
5629 PyObject *exc = NULL;
5630 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005631
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005632 if (!PyUnicode_Check(str)) {
5633 PyErr_BadArgument();
5634 return NULL;
5635 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005636 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005637 return NULL;
5638 kind = PyUnicode_KIND(str);
5639 data = PyUnicode_DATA(str);
5640 len = PyUnicode_GET_LENGTH(str);
5641
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005642 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005643 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005644 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005645 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005646 if (v == NULL)
5647 return NULL;
5648
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005649 /* output buffer is 4-bytes aligned */
5650 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005651 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005652 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005653 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005654 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005655 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005656
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005657 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005658 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005659 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005661 else
5662 encoding = "utf-32";
5663
5664 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005665 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5666 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005667 }
5668
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005669 pos = 0;
5670 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005671 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005672
5673 if (kind == PyUnicode_2BYTE_KIND) {
5674 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5675 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005676 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005677 else {
5678 assert(kind == PyUnicode_4BYTE_KIND);
5679 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5680 &out, native_ordering);
5681 }
5682 if (pos == len)
5683 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005684
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005685 rep = unicode_encode_call_errorhandler(
5686 errors, &errorHandler,
5687 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005688 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005689 if (!rep)
5690 goto error;
5691
5692 if (PyBytes_Check(rep)) {
5693 repsize = PyBytes_GET_SIZE(rep);
5694 if (repsize & 3) {
5695 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005696 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005697 "surrogates not allowed");
5698 goto error;
5699 }
5700 moreunits = repsize / 4;
5701 }
5702 else {
5703 assert(PyUnicode_Check(rep));
5704 if (PyUnicode_READY(rep) < 0)
5705 goto error;
5706 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5707 if (!PyUnicode_IS_ASCII(rep)) {
5708 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005709 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005710 "surrogates not allowed");
5711 goto error;
5712 }
5713 }
5714
5715 /* four bytes are reserved for each surrogate */
5716 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005717 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005718 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005719 /* integer overflow */
5720 PyErr_NoMemory();
5721 goto error;
5722 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005723 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005724 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005725 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 }
5727
5728 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005729 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005730 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005731 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005733 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5734 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005735 }
5736
5737 Py_CLEAR(rep);
5738 }
5739
5740 /* Cut back to size actually needed. This is necessary for, for example,
5741 encoding of a string containing isolated surrogates and the 'ignore'
5742 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005743 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005744 if (nsize != PyBytes_GET_SIZE(v))
5745 _PyBytes_Resize(&v, nsize);
5746 Py_XDECREF(errorHandler);
5747 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005748 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005749 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005750 error:
5751 Py_XDECREF(rep);
5752 Py_XDECREF(errorHandler);
5753 Py_XDECREF(exc);
5754 Py_XDECREF(v);
5755 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005756}
5757
Alexander Belopolsky40018472011-02-26 01:02:56 +00005758PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005759PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5760 Py_ssize_t size,
5761 const char *errors,
5762 int byteorder)
5763{
5764 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005765 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005766 if (tmp == NULL)
5767 return NULL;
5768 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5769 Py_DECREF(tmp);
5770 return result;
5771}
5772
5773PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005774PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005775{
Victor Stinnerb960b342011-11-20 19:12:52 +01005776 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005777}
5778
Guido van Rossumd57fd912000-03-10 22:53:23 +00005779/* --- UTF-16 Codec ------------------------------------------------------- */
5780
Tim Peters772747b2001-08-09 22:21:55 +00005781PyObject *
5782PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005783 Py_ssize_t size,
5784 const char *errors,
5785 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005786{
Walter Dörwald69652032004-09-07 20:24:22 +00005787 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5788}
5789
5790PyObject *
5791PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005792 Py_ssize_t size,
5793 const char *errors,
5794 int *byteorder,
5795 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005796{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005797 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005798 Py_ssize_t startinpos;
5799 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005800 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005801 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005802 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005803 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005804 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005805 PyObject *errorHandler = NULL;
5806 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005807 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005808
Tim Peters772747b2001-08-09 22:21:55 +00005809 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005810 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005811
5812 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005813 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005814
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005815 /* Check for BOM marks (U+FEFF) in the input and adjust current
5816 byte order setting accordingly. In native mode, the leading BOM
5817 mark is skipped, in all other modes, it is copied to the output
5818 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005819 if (bo == 0 && size >= 2) {
5820 const Py_UCS4 bom = (q[1] << 8) | q[0];
5821 if (bom == 0xFEFF) {
5822 q += 2;
5823 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005824 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005825 else if (bom == 0xFFFE) {
5826 q += 2;
5827 bo = 1;
5828 }
5829 if (byteorder)
5830 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005831 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005832
Antoine Pitrou63065d72012-05-15 23:48:04 +02005833 if (q == e) {
5834 if (consumed)
5835 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005836 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005837 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005838
Christian Heimes743e0cd2012-10-17 23:52:17 +02005839#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005840 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005841 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005842#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005843 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005845#endif
Tim Peters772747b2001-08-09 22:21:55 +00005846
Antoine Pitrou63065d72012-05-15 23:48:04 +02005847 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005848 character count normally. Error handler will take care of
5849 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005850 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005851 writer.min_length = (e - q + 1) / 2;
5852 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005853 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005854
Antoine Pitrou63065d72012-05-15 23:48:04 +02005855 while (1) {
5856 Py_UCS4 ch = 0;
5857 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005859 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005860 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005861 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005863 native_ordering);
5864 else
5865 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005867 native_ordering);
5868 } else if (kind == PyUnicode_2BYTE_KIND) {
5869 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005870 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005871 native_ordering);
5872 } else {
5873 assert(kind == PyUnicode_4BYTE_KIND);
5874 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005875 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005876 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005877 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005878 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005879
Antoine Pitrou63065d72012-05-15 23:48:04 +02005880 switch (ch)
5881 {
5882 case 0:
5883 /* remaining byte at the end? (size should be even) */
5884 if (q == e || consumed)
5885 goto End;
5886 errmsg = "truncated data";
5887 startinpos = ((const char *)q) - starts;
5888 endinpos = ((const char *)e) - starts;
5889 break;
5890 /* The remaining input chars are ignored if the callback
5891 chooses to skip the input */
5892 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005893 q -= 2;
5894 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005895 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005896 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005897 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005898 endinpos = ((const char *)e) - starts;
5899 break;
5900 case 2:
5901 errmsg = "illegal encoding";
5902 startinpos = ((const char *)q) - 2 - starts;
5903 endinpos = startinpos + 2;
5904 break;
5905 case 3:
5906 errmsg = "illegal UTF-16 surrogate";
5907 startinpos = ((const char *)q) - 4 - starts;
5908 endinpos = startinpos + 2;
5909 break;
5910 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005911 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005912 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 continue;
5914 }
5915
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005916 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005917 errors,
5918 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005919 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005920 &starts,
5921 (const char **)&e,
5922 &startinpos,
5923 &endinpos,
5924 &exc,
5925 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005926 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005927 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005928 }
5929
Antoine Pitrou63065d72012-05-15 23:48:04 +02005930End:
Walter Dörwald69652032004-09-07 20:24:22 +00005931 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005932 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005933
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005934 Py_XDECREF(errorHandler);
5935 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005936 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005937
Benjamin Peterson29060642009-01-31 22:14:21 +00005938 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005939 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005940 Py_XDECREF(errorHandler);
5941 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005942 return NULL;
5943}
5944
Tim Peters772747b2001-08-09 22:21:55 +00005945PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005946_PyUnicode_EncodeUTF16(PyObject *str,
5947 const char *errors,
5948 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005949{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005950 enum PyUnicode_Kind kind;
5951 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005952 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005953 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005954 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005955 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005956#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005957 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005958#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005959 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005960#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005961 const char *encoding;
5962 Py_ssize_t nsize, pos;
5963 PyObject *errorHandler = NULL;
5964 PyObject *exc = NULL;
5965 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005966
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005967 if (!PyUnicode_Check(str)) {
5968 PyErr_BadArgument();
5969 return NULL;
5970 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005971 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005972 return NULL;
5973 kind = PyUnicode_KIND(str);
5974 data = PyUnicode_DATA(str);
5975 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005976
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005977 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005978 if (kind == PyUnicode_4BYTE_KIND) {
5979 const Py_UCS4 *in = (const Py_UCS4 *)data;
5980 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005981 while (in < end) {
5982 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005983 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005984 }
5985 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005986 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005987 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005988 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005989 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005990 nsize = len + pairs + (byteorder == 0);
5991 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005992 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005993 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005994 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005995
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005996 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005997 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005998 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005999 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006000 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006001 }
6002 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006003 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006004 }
Tim Peters772747b2001-08-09 22:21:55 +00006005
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006006 if (kind == PyUnicode_1BYTE_KIND) {
6007 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6008 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006009 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006010
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006011 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006012 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006013 }
6014 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006015 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006016 }
6017 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006018 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006019 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006020
6021 pos = 0;
6022 while (pos < len) {
6023 Py_ssize_t repsize, moreunits;
6024
6025 if (kind == PyUnicode_2BYTE_KIND) {
6026 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6027 &out, native_ordering);
6028 }
6029 else {
6030 assert(kind == PyUnicode_4BYTE_KIND);
6031 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6032 &out, native_ordering);
6033 }
6034 if (pos == len)
6035 break;
6036
6037 rep = unicode_encode_call_errorhandler(
6038 errors, &errorHandler,
6039 encoding, "surrogates not allowed",
6040 str, &exc, pos, pos + 1, &pos);
6041 if (!rep)
6042 goto error;
6043
6044 if (PyBytes_Check(rep)) {
6045 repsize = PyBytes_GET_SIZE(rep);
6046 if (repsize & 1) {
6047 raise_encode_exception(&exc, encoding,
6048 str, pos - 1, pos,
6049 "surrogates not allowed");
6050 goto error;
6051 }
6052 moreunits = repsize / 2;
6053 }
6054 else {
6055 assert(PyUnicode_Check(rep));
6056 if (PyUnicode_READY(rep) < 0)
6057 goto error;
6058 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6059 if (!PyUnicode_IS_ASCII(rep)) {
6060 raise_encode_exception(&exc, encoding,
6061 str, pos - 1, pos,
6062 "surrogates not allowed");
6063 goto error;
6064 }
6065 }
6066
6067 /* two bytes are reserved for each surrogate */
6068 if (moreunits > 1) {
6069 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006070 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006071 /* integer overflow */
6072 PyErr_NoMemory();
6073 goto error;
6074 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006075 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006076 goto error;
6077 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6078 }
6079
6080 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006081 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006082 out += moreunits;
6083 } else /* rep is unicode */ {
6084 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6085 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6086 &out, native_ordering);
6087 }
6088
6089 Py_CLEAR(rep);
6090 }
6091
6092 /* Cut back to size actually needed. This is necessary for, for example,
6093 encoding of a string containing isolated surrogates and the 'ignore' handler
6094 is used. */
6095 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6096 if (nsize != PyBytes_GET_SIZE(v))
6097 _PyBytes_Resize(&v, nsize);
6098 Py_XDECREF(errorHandler);
6099 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006100 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006101 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006102 error:
6103 Py_XDECREF(rep);
6104 Py_XDECREF(errorHandler);
6105 Py_XDECREF(exc);
6106 Py_XDECREF(v);
6107 return NULL;
6108#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006109}
6110
Alexander Belopolsky40018472011-02-26 01:02:56 +00006111PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006112PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6113 Py_ssize_t size,
6114 const char *errors,
6115 int byteorder)
6116{
6117 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006118 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006119 if (tmp == NULL)
6120 return NULL;
6121 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6122 Py_DECREF(tmp);
6123 return result;
6124}
6125
6126PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006127PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006128{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006129 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006130}
6131
6132/* --- Unicode Escape Codec ----------------------------------------------- */
6133
Fredrik Lundh06d12682001-01-24 07:59:11 +00006134static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006135
Alexander Belopolsky40018472011-02-26 01:02:56 +00006136PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006137_PyUnicode_DecodeUnicodeEscape(const char *s,
6138 Py_ssize_t size,
6139 const char *errors,
6140 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006142 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006143 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006144 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006145 PyObject *errorHandler = NULL;
6146 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006147
Eric V. Smith42454af2016-10-31 09:22:08 -04006148 // so we can remember if we've seen an invalid escape char or not
6149 *first_invalid_escape = NULL;
6150
Victor Stinner62ec3312016-09-06 17:04:34 -07006151 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006152 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 }
6154 /* Escaped strings will always be longer than the resulting
6155 Unicode string, so we start with size here and then reduce the
6156 length after conversion to the true value.
6157 (but if the error callback returns a long replacement string
6158 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006159 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 writer.min_length = size;
6161 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6162 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006163 }
6164
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 end = s + size;
6166 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006167 unsigned char c = (unsigned char) *s++;
6168 Py_UCS4 ch;
6169 int count;
6170 Py_ssize_t startinpos;
6171 Py_ssize_t endinpos;
6172 const char *message;
6173
6174#define WRITE_ASCII_CHAR(ch) \
6175 do { \
6176 assert(ch <= 127); \
6177 assert(writer.pos < writer.size); \
6178 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6179 } while(0)
6180
6181#define WRITE_CHAR(ch) \
6182 do { \
6183 if (ch <= writer.maxchar) { \
6184 assert(writer.pos < writer.size); \
6185 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6186 } \
6187 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6188 goto onError; \
6189 } \
6190 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006191
6192 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 if (c != '\\') {
6194 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195 continue;
6196 }
6197
Victor Stinner62ec3312016-09-06 17:04:34 -07006198 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006200 if (s >= end) {
6201 message = "\\ at end of string";
6202 goto error;
6203 }
6204 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006205
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006207 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006208
Benjamin Peterson29060642009-01-31 22:14:21 +00006209 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 case '\n': continue;
6211 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6212 case '\'': WRITE_ASCII_CHAR('\''); continue;
6213 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6214 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006215 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006216 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6217 case 't': WRITE_ASCII_CHAR('\t'); continue;
6218 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6219 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006220 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006222 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006223 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006224
Benjamin Peterson29060642009-01-31 22:14:21 +00006225 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006226 case '0': case '1': case '2': case '3':
6227 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006228 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006229 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 ch = (ch<<3) + *s++ - '0';
6231 if (s < end && '0' <= *s && *s <= '7') {
6232 ch = (ch<<3) + *s++ - '0';
6233 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006234 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 WRITE_CHAR(ch);
6236 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006237
Benjamin Peterson29060642009-01-31 22:14:21 +00006238 /* hex escapes */
6239 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006240 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006241 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006242 message = "truncated \\xXX escape";
6243 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244
Benjamin Peterson29060642009-01-31 22:14:21 +00006245 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006246 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006248 message = "truncated \\uXXXX escape";
6249 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250
Benjamin Peterson29060642009-01-31 22:14:21 +00006251 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006252 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006254 message = "truncated \\UXXXXXXXX escape";
6255 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006257 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 ch <<= 4;
6259 if (c >= '0' && c <= '9') {
6260 ch += c - '0';
6261 }
6262 else if (c >= 'a' && c <= 'f') {
6263 ch += c - ('a' - 10);
6264 }
6265 else if (c >= 'A' && c <= 'F') {
6266 ch += c - ('A' - 10);
6267 }
6268 else {
6269 break;
6270 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006271 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006273 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006274 }
6275
6276 /* when we get here, ch is a 32-bit unicode character */
6277 if (ch > MAX_UNICODE) {
6278 message = "illegal Unicode character";
6279 goto error;
6280 }
6281
6282 WRITE_CHAR(ch);
6283 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006284
Benjamin Peterson29060642009-01-31 22:14:21 +00006285 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006286 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006287 if (ucnhash_CAPI == NULL) {
6288 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006289 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6290 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006291 if (ucnhash_CAPI == NULL) {
6292 PyErr_SetString(
6293 PyExc_UnicodeError,
6294 "\\N escapes not supported (can't load unicodedata module)"
6295 );
6296 goto onError;
6297 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006298 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006299
6300 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006301 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006302 const char *start = ++s;
6303 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006304 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006306 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 namelen = s - start;
6308 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006309 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006310 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 ch = 0xffffffff; /* in case 'getcode' messes up */
6312 if (namelen <= INT_MAX &&
6313 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6314 &ch, 0)) {
6315 assert(ch <= MAX_UNICODE);
6316 WRITE_CHAR(ch);
6317 continue;
6318 }
6319 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006320 }
6321 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006322 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006323
6324 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006325 if (*first_invalid_escape == NULL) {
6326 *first_invalid_escape = s-1; /* Back up one char, since we've
6327 already incremented s. */
6328 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006329 WRITE_ASCII_CHAR('\\');
6330 WRITE_CHAR(c);
6331 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006332 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006333
6334 error:
6335 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006337 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006338 errors, &errorHandler,
6339 "unicodeescape", message,
6340 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006341 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006342 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006344 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006345
6346#undef WRITE_ASCII_CHAR
6347#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006348 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006349
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006350 Py_XDECREF(errorHandler);
6351 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006352 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006353
Benjamin Peterson29060642009-01-31 22:14:21 +00006354 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006355 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006356 Py_XDECREF(errorHandler);
6357 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006358 return NULL;
6359}
6360
Eric V. Smith42454af2016-10-31 09:22:08 -04006361PyObject *
6362PyUnicode_DecodeUnicodeEscape(const char *s,
6363 Py_ssize_t size,
6364 const char *errors)
6365{
6366 const char *first_invalid_escape;
6367 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6368 &first_invalid_escape);
6369 if (result == NULL)
6370 return NULL;
6371 if (first_invalid_escape != NULL) {
6372 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6373 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006374 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006375 Py_DECREF(result);
6376 return NULL;
6377 }
6378 }
6379 return result;
6380}
6381
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006382/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006383
Alexander Belopolsky40018472011-02-26 01:02:56 +00006384PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006385PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006386{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006387 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006390 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006392 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393
Ezio Melottie7f90372012-10-05 03:33:31 +03006394 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006395 escape.
6396
Ezio Melottie7f90372012-10-05 03:33:31 +03006397 For UCS1 strings it's '\xxx', 4 bytes per source character.
6398 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6399 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006400 */
6401
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006402 if (!PyUnicode_Check(unicode)) {
6403 PyErr_BadArgument();
6404 return NULL;
6405 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006406 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006407 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 }
Victor Stinner358af132015-10-12 22:36:57 +02006409
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006410 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006411 if (len == 0) {
6412 return PyBytes_FromStringAndSize(NULL, 0);
6413 }
6414
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006415 kind = PyUnicode_KIND(unicode);
6416 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006417 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6418 bytes, and 1 byte characters 4. */
6419 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006420 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 return PyErr_NoMemory();
6422 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006423 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006424 if (repr == NULL) {
6425 return NULL;
6426 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006429 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006430 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006431
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 /* U+0000-U+00ff range */
6433 if (ch < 0x100) {
6434 if (ch >= ' ' && ch < 127) {
6435 if (ch != '\\') {
6436 /* Copy printable US ASCII as-is */
6437 *p++ = (char) ch;
6438 }
6439 /* Escape backslashes */
6440 else {
6441 *p++ = '\\';
6442 *p++ = '\\';
6443 }
6444 }
Victor Stinner358af132015-10-12 22:36:57 +02006445
Victor Stinner62ec3312016-09-06 17:04:34 -07006446 /* Map special whitespace to '\t', \n', '\r' */
6447 else if (ch == '\t') {
6448 *p++ = '\\';
6449 *p++ = 't';
6450 }
6451 else if (ch == '\n') {
6452 *p++ = '\\';
6453 *p++ = 'n';
6454 }
6455 else if (ch == '\r') {
6456 *p++ = '\\';
6457 *p++ = 'r';
6458 }
6459
6460 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6461 else {
6462 *p++ = '\\';
6463 *p++ = 'x';
6464 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6465 *p++ = Py_hexdigits[ch & 0x000F];
6466 }
Tim Petersced69f82003-09-16 20:30:58 +00006467 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006468 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006470 *p++ = '\\';
6471 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006472 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6473 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6474 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6475 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006476 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006477 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6478 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006479
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 /* Make sure that the first two digits are zero */
6481 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006482 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006483 *p++ = 'U';
6484 *p++ = '0';
6485 *p++ = '0';
6486 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6487 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6488 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6489 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6490 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6491 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006492 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006493 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006494
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 assert(p - PyBytes_AS_STRING(repr) > 0);
6496 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6497 return NULL;
6498 }
6499 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006500}
6501
Alexander Belopolsky40018472011-02-26 01:02:56 +00006502PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006503PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6504 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006505{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006506 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006507 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006508 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 }
6511
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006512 result = PyUnicode_AsUnicodeEscapeString(tmp);
6513 Py_DECREF(tmp);
6514 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006515}
6516
6517/* --- Raw Unicode Escape Codec ------------------------------------------- */
6518
Alexander Belopolsky40018472011-02-26 01:02:56 +00006519PyObject *
6520PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006521 Py_ssize_t size,
6522 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006523{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006524 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006525 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006526 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006527 PyObject *errorHandler = NULL;
6528 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006529
Victor Stinner62ec3312016-09-06 17:04:34 -07006530 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006531 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006532 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006533
Guido van Rossumd57fd912000-03-10 22:53:23 +00006534 /* Escaped strings will always be longer than the resulting
6535 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006536 length after conversion to the true value. (But decoding error
6537 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006538 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006539 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006540 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6541 goto onError;
6542 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006543
Guido van Rossumd57fd912000-03-10 22:53:23 +00006544 end = s + size;
6545 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006546 unsigned char c = (unsigned char) *s++;
6547 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006548 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006549 Py_ssize_t startinpos;
6550 Py_ssize_t endinpos;
6551 const char *message;
6552
6553#define WRITE_CHAR(ch) \
6554 do { \
6555 if (ch <= writer.maxchar) { \
6556 assert(writer.pos < writer.size); \
6557 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6558 } \
6559 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6560 goto onError; \
6561 } \
6562 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563
Benjamin Peterson29060642009-01-31 22:14:21 +00006564 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006565 if (c != '\\' || s >= end) {
6566 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006567 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006568 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006569
Victor Stinner62ec3312016-09-06 17:04:34 -07006570 c = (unsigned char) *s++;
6571 if (c == 'u') {
6572 count = 4;
6573 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006574 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006575 else if (c == 'U') {
6576 count = 8;
6577 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006578 }
6579 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006580 assert(writer.pos < writer.size);
6581 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6582 WRITE_CHAR(c);
6583 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006584 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 startinpos = s - starts - 2;
6586
6587 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6588 for (ch = 0; count && s < end; ++s, --count) {
6589 c = (unsigned char)*s;
6590 ch <<= 4;
6591 if (c >= '0' && c <= '9') {
6592 ch += c - '0';
6593 }
6594 else if (c >= 'a' && c <= 'f') {
6595 ch += c - ('a' - 10);
6596 }
6597 else if (c >= 'A' && c <= 'F') {
6598 ch += c - ('A' - 10);
6599 }
6600 else {
6601 break;
6602 }
6603 }
6604 if (!count) {
6605 if (ch <= MAX_UNICODE) {
6606 WRITE_CHAR(ch);
6607 continue;
6608 }
6609 message = "\\Uxxxxxxxx out of range";
6610 }
6611
6612 endinpos = s-starts;
6613 writer.min_length = end - s + writer.pos;
6614 if (unicode_decode_call_errorhandler_writer(
6615 errors, &errorHandler,
6616 "rawunicodeescape", message,
6617 &starts, &end, &startinpos, &endinpos, &exc, &s,
6618 &writer)) {
6619 goto onError;
6620 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006621 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006622
6623#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006624 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006625 Py_XDECREF(errorHandler);
6626 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006627 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006628
Benjamin Peterson29060642009-01-31 22:14:21 +00006629 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006630 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006631 Py_XDECREF(errorHandler);
6632 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006634
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635}
6636
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006637
Alexander Belopolsky40018472011-02-26 01:02:56 +00006638PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006639PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640{
Victor Stinner62ec3312016-09-06 17:04:34 -07006641 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006642 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006643 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 int kind;
6645 void *data;
6646 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006648 if (!PyUnicode_Check(unicode)) {
6649 PyErr_BadArgument();
6650 return NULL;
6651 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006652 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006653 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006654 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006655 kind = PyUnicode_KIND(unicode);
6656 data = PyUnicode_DATA(unicode);
6657 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006658 if (kind == PyUnicode_1BYTE_KIND) {
6659 return PyBytes_FromStringAndSize(data, len);
6660 }
Victor Stinner0e368262011-11-10 20:12:49 +01006661
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6663 bytes, and 1 byte characters 4. */
6664 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006665
Victor Stinner62ec3312016-09-06 17:04:34 -07006666 if (len > PY_SSIZE_T_MAX / expandsize) {
6667 return PyErr_NoMemory();
6668 }
6669 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6670 if (repr == NULL) {
6671 return NULL;
6672 }
6673 if (len == 0) {
6674 return repr;
6675 }
6676
6677 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006678 for (pos = 0; pos < len; pos++) {
6679 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006680
Victor Stinner62ec3312016-09-06 17:04:34 -07006681 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6682 if (ch < 0x100) {
6683 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006684 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006685 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006686 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006687 *p++ = '\\';
6688 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006689 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6690 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6691 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6692 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006693 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006694 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6695 else {
6696 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6697 *p++ = '\\';
6698 *p++ = 'U';
6699 *p++ = '0';
6700 *p++ = '0';
6701 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6702 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6703 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6704 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6705 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6706 *p++ = Py_hexdigits[ch & 15];
6707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006708 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006709
Victor Stinner62ec3312016-09-06 17:04:34 -07006710 assert(p > PyBytes_AS_STRING(repr));
6711 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6712 return NULL;
6713 }
6714 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006715}
6716
Alexander Belopolsky40018472011-02-26 01:02:56 +00006717PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006718PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6719 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006720{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006721 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006722 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006723 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006724 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006725 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6726 Py_DECREF(tmp);
6727 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006728}
6729
6730/* --- Latin-1 Codec ------------------------------------------------------ */
6731
Alexander Belopolsky40018472011-02-26 01:02:56 +00006732PyObject *
6733PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006734 Py_ssize_t size,
6735 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006736{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006737 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006738 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006739}
6740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006742static void
6743make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006744 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006745 PyObject *unicode,
6746 Py_ssize_t startpos, Py_ssize_t endpos,
6747 const char *reason)
6748{
6749 if (*exceptionObject == NULL) {
6750 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006751 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006752 encoding, unicode, startpos, endpos, reason);
6753 }
6754 else {
6755 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6756 goto onError;
6757 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6758 goto onError;
6759 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6760 goto onError;
6761 return;
6762 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006763 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006764 }
6765}
6766
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006768static void
6769raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006770 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006771 PyObject *unicode,
6772 Py_ssize_t startpos, Py_ssize_t endpos,
6773 const char *reason)
6774{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006775 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006776 encoding, unicode, startpos, endpos, reason);
6777 if (*exceptionObject != NULL)
6778 PyCodec_StrictErrors(*exceptionObject);
6779}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006780
6781/* error handling callback helper:
6782 build arguments, call the callback and check the arguments,
6783 put the result into newpos and return the replacement string, which
6784 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006785static PyObject *
6786unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006787 PyObject **errorHandler,
6788 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006789 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006790 Py_ssize_t startpos, Py_ssize_t endpos,
6791 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006792{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006793 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006795 PyObject *restuple;
6796 PyObject *resunicode;
6797
6798 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006800 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006802 }
6803
Benjamin Petersonbac79492012-01-14 13:34:47 -05006804 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 return NULL;
6806 len = PyUnicode_GET_LENGTH(unicode);
6807
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006808 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006809 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006810 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006812
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006813 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006814 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006817 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 Py_DECREF(restuple);
6819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006821 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 &resunicode, newpos)) {
6823 Py_DECREF(restuple);
6824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006825 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006826 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6827 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6828 Py_DECREF(restuple);
6829 return NULL;
6830 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006831 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006832 *newpos = len + *newpos;
6833 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006834 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 Py_DECREF(restuple);
6836 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006837 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006838 Py_INCREF(resunicode);
6839 Py_DECREF(restuple);
6840 return resunicode;
6841}
6842
Alexander Belopolsky40018472011-02-26 01:02:56 +00006843static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006844unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006845 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006846 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848 /* input state */
6849 Py_ssize_t pos=0, size;
6850 int kind;
6851 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006852 /* pointer into the output */
6853 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006854 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6855 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006856 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006858 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006859 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006860 /* output object */
6861 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862
Benjamin Petersonbac79492012-01-14 13:34:47 -05006863 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006864 return NULL;
6865 size = PyUnicode_GET_LENGTH(unicode);
6866 kind = PyUnicode_KIND(unicode);
6867 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 /* allocate enough for a simple encoding without
6869 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006870 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006871 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006872
6873 _PyBytesWriter_Init(&writer);
6874 str = _PyBytesWriter_Alloc(&writer, size);
6875 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006876 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006877
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006878 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006879 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006880
Benjamin Peterson29060642009-01-31 22:14:21 +00006881 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006882 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006883 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006884 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006885 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006886 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006888 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006890 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006891 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006892 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006893
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006894 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006895 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006896
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006897 /* Only overallocate the buffer if it's not the last write */
6898 writer.overallocate = (collend < size);
6899
Benjamin Peterson29060642009-01-31 22:14:21 +00006900 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006901 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006902 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006903
6904 switch (error_handler) {
6905 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006906 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006907 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006908
6909 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006910 memset(str, '?', collend - collstart);
6911 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006912 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006913 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006914 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006915 break;
Victor Stinner50149202015-09-22 00:26:54 +02006916
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006917 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006918 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006919 writer.min_size -= (collend - collstart);
6920 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006921 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006922 if (str == NULL)
6923 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006924 pos = collend;
6925 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006926
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006927 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006928 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006929 writer.min_size -= (collend - collstart);
6930 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006931 unicode, collstart, collend);
6932 if (str == NULL)
6933 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006934 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006935 break;
Victor Stinner50149202015-09-22 00:26:54 +02006936
Victor Stinnerc3713e92015-09-29 12:32:13 +02006937 case _Py_ERROR_SURROGATEESCAPE:
6938 for (i = collstart; i < collend; ++i) {
6939 ch = PyUnicode_READ(kind, data, i);
6940 if (ch < 0xdc80 || 0xdcff < ch) {
6941 /* Not a UTF-8b surrogate */
6942 break;
6943 }
6944 *str++ = (char)(ch - 0xdc00);
6945 ++pos;
6946 }
6947 if (i >= collend)
6948 break;
6949 collstart = pos;
6950 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006951 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006952
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006954 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6955 encoding, reason, unicode, &exc,
6956 collstart, collend, &newpos);
6957 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006958 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006959
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006960 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006961 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006962
Victor Stinner6bd525b2015-10-09 13:10:05 +02006963 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006964 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006965 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006966 PyBytes_AS_STRING(rep),
6967 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006968 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006969 else {
6970 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006971
Victor Stinner6bd525b2015-10-09 13:10:05 +02006972 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006973 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006974
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006975 if (limit == 256 ?
6976 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6977 !PyUnicode_IS_ASCII(rep))
6978 {
6979 /* Not all characters are smaller than limit */
6980 raise_encode_exception(&exc, encoding, unicode,
6981 collstart, collend, reason);
6982 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006983 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006984 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6985 str = _PyBytesWriter_WriteBytes(&writer, str,
6986 PyUnicode_DATA(rep),
6987 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006988 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006989 if (str == NULL)
6990 goto onError;
6991
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006992 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006993 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006994 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006995
6996 /* If overallocation was disabled, ensure that it was the last
6997 write. Otherwise, we missed an optimization */
6998 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006999 }
7000 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007001
Victor Stinner50149202015-09-22 00:26:54 +02007002 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007003 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007004 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007005
7006 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007007 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007008 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007009 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007010 Py_XDECREF(exc);
7011 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007012}
7013
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007014/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007015PyObject *
7016PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007017 Py_ssize_t size,
7018 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007019{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007020 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007021 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007022 if (unicode == NULL)
7023 return NULL;
7024 result = unicode_encode_ucs1(unicode, errors, 256);
7025 Py_DECREF(unicode);
7026 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007027}
7028
Alexander Belopolsky40018472011-02-26 01:02:56 +00007029PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007030_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031{
7032 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007033 PyErr_BadArgument();
7034 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007036 if (PyUnicode_READY(unicode) == -1)
7037 return NULL;
7038 /* Fast path: if it is a one-byte string, construct
7039 bytes object directly. */
7040 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7041 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7042 PyUnicode_GET_LENGTH(unicode));
7043 /* Non-Latin-1 characters present. Defer to above function to
7044 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007045 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007046}
7047
7048PyObject*
7049PyUnicode_AsLatin1String(PyObject *unicode)
7050{
7051 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007052}
7053
7054/* --- 7-bit ASCII Codec -------------------------------------------------- */
7055
Alexander Belopolsky40018472011-02-26 01:02:56 +00007056PyObject *
7057PyUnicode_DecodeASCII(const char *s,
7058 Py_ssize_t size,
7059 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007062 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007063 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007064 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007065 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007066
Guido van Rossumd57fd912000-03-10 22:53:23 +00007067 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007068 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007069
Guido van Rossumd57fd912000-03-10 22:53:23 +00007070 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007071 if (size == 1 && (unsigned char)s[0] < 128)
7072 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007073
Inada Naoki770847a2019-06-24 12:30:24 +09007074 // Shortcut for simple case
7075 PyObject *u = PyUnicode_New(size, 127);
7076 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007077 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007078 }
7079 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7080 if (outpos == size) {
7081 return u;
7082 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007083
Inada Naoki770847a2019-06-24 12:30:24 +09007084 _PyUnicodeWriter writer;
7085 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007086 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007087
Inada Naoki770847a2019-06-24 12:30:24 +09007088 s += outpos;
7089 int kind = writer.kind;
7090 void *data = writer.data;
7091 Py_ssize_t startinpos, endinpos;
7092
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007093 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007094 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007095 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007096 PyUnicode_WRITE(kind, data, writer.pos, c);
7097 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007098 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007099 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007101
7102 /* byte outsize range 0x00..0x7f: call the error handler */
7103
7104 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007105 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007106
7107 switch (error_handler)
7108 {
7109 case _Py_ERROR_REPLACE:
7110 case _Py_ERROR_SURROGATEESCAPE:
7111 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007112 but we may switch to UCS2 at the first write */
7113 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7114 goto onError;
7115 kind = writer.kind;
7116 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007117
7118 if (error_handler == _Py_ERROR_REPLACE)
7119 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7120 else
7121 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7122 writer.pos++;
7123 ++s;
7124 break;
7125
7126 case _Py_ERROR_IGNORE:
7127 ++s;
7128 break;
7129
7130 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007131 startinpos = s-starts;
7132 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007133 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007134 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 "ascii", "ordinal not in range(128)",
7136 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007137 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007138 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007139 kind = writer.kind;
7140 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007141 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007142 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007143 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007144 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007145 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007146
Benjamin Peterson29060642009-01-31 22:14:21 +00007147 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007148 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007149 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007150 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007151 return NULL;
7152}
7153
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007154/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007155PyObject *
7156PyUnicode_EncodeASCII(const Py_UNICODE *p,
7157 Py_ssize_t size,
7158 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007159{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007160 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007161 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007162 if (unicode == NULL)
7163 return NULL;
7164 result = unicode_encode_ucs1(unicode, errors, 128);
7165 Py_DECREF(unicode);
7166 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007167}
7168
Alexander Belopolsky40018472011-02-26 01:02:56 +00007169PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007170_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171{
7172 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007173 PyErr_BadArgument();
7174 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007176 if (PyUnicode_READY(unicode) == -1)
7177 return NULL;
7178 /* Fast path: if it is an ASCII-only string, construct bytes object
7179 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007180 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007181 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7182 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007183 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007184}
7185
7186PyObject *
7187PyUnicode_AsASCIIString(PyObject *unicode)
7188{
7189 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007190}
7191
Steve Dowercc16be82016-09-08 10:35:16 -07007192#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007193
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007194/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007195
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007196#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197#define NEED_RETRY
7198#endif
7199
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7204#define DECODING_CHUNK_SIZE (INT_MAX/4)
7205
Victor Stinner3a50e702011-10-18 21:21:00 +02007206#ifndef WC_ERR_INVALID_CHARS
7207# define WC_ERR_INVALID_CHARS 0x0080
7208#endif
7209
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007210static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007211code_page_name(UINT code_page, PyObject **obj)
7212{
7213 *obj = NULL;
7214 if (code_page == CP_ACP)
7215 return "mbcs";
7216 if (code_page == CP_UTF7)
7217 return "CP_UTF7";
7218 if (code_page == CP_UTF8)
7219 return "CP_UTF8";
7220
7221 *obj = PyBytes_FromFormat("cp%u", code_page);
7222 if (*obj == NULL)
7223 return NULL;
7224 return PyBytes_AS_STRING(*obj);
7225}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007226
Victor Stinner3a50e702011-10-18 21:21:00 +02007227static DWORD
7228decode_code_page_flags(UINT code_page)
7229{
7230 if (code_page == CP_UTF7) {
7231 /* The CP_UTF7 decoder only supports flags=0 */
7232 return 0;
7233 }
7234 else
7235 return MB_ERR_INVALID_CHARS;
7236}
7237
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007238/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 * Decode a byte string from a Windows code page into unicode object in strict
7240 * mode.
7241 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007242 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7243 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007244 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007245static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007246decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007247 wchar_t **buf,
7248 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 const char *in,
7250 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007251{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007252 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007253 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255
7256 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007258 while ((outsize = MultiByteToWideChar(code_page, flags,
7259 in, insize, NULL, 0)) <= 0)
7260 {
7261 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7262 goto error;
7263 }
7264 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7265 flags = 0;
7266 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007267
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007268 /* Extend a wchar_t* buffer */
7269 Py_ssize_t n = *bufsize; /* Get the current length */
7270 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7271 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007272 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007273 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007274
7275 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7277 if (outsize <= 0)
7278 goto error;
7279 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007280
Victor Stinner3a50e702011-10-18 21:21:00 +02007281error:
7282 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7283 return -2;
7284 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007285 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007286}
7287
Victor Stinner3a50e702011-10-18 21:21:00 +02007288/*
7289 * Decode a byte string from a code page into unicode object with an error
7290 * handler.
7291 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007292 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007293 * UnicodeDecodeError exception and returns -1 on error.
7294 */
7295static int
7296decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007297 wchar_t **buf,
7298 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007299 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007300 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007301{
7302 const char *startin = in;
7303 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007304 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007305 /* Ideally, we should get reason from FormatMessage. This is the Windows
7306 2000 English version of the message. */
7307 const char *reason = "No mapping for the Unicode character exists "
7308 "in the target code page.";
7309 /* each step cannot decode more than 1 character, but a character can be
7310 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007311 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007312 int insize;
7313 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 PyObject *errorHandler = NULL;
7315 PyObject *exc = NULL;
7316 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007317 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 DWORD err;
7319 int ret = -1;
7320
7321 assert(size > 0);
7322
7323 encoding = code_page_name(code_page, &encoding_obj);
7324 if (encoding == NULL)
7325 return -1;
7326
Victor Stinner7d00cc12014-03-17 23:08:06 +01007327 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007328 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7329 UnicodeDecodeError. */
7330 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7331 if (exc != NULL) {
7332 PyCodec_StrictErrors(exc);
7333 Py_CLEAR(exc);
7334 }
7335 goto error;
7336 }
7337
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007338 /* Extend a wchar_t* buffer */
7339 Py_ssize_t n = *bufsize; /* Get the current length */
7340 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7341 PyErr_NoMemory();
7342 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007343 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007344 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7345 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007346 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007347 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007348
7349 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 while (in < endin)
7351 {
7352 /* Decode a character */
7353 insize = 1;
7354 do
7355 {
7356 outsize = MultiByteToWideChar(code_page, flags,
7357 in, insize,
7358 buffer, Py_ARRAY_LENGTH(buffer));
7359 if (outsize > 0)
7360 break;
7361 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007362 if (err == ERROR_INVALID_FLAGS && flags) {
7363 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7364 flags = 0;
7365 continue;
7366 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007367 if (err != ERROR_NO_UNICODE_TRANSLATION
7368 && err != ERROR_INSUFFICIENT_BUFFER)
7369 {
7370 PyErr_SetFromWindowsErr(0);
7371 goto error;
7372 }
7373 insize++;
7374 }
7375 /* 4=maximum length of a UTF-8 sequence */
7376 while (insize <= 4 && (in + insize) <= endin);
7377
7378 if (outsize <= 0) {
7379 Py_ssize_t startinpos, endinpos, outpos;
7380
Victor Stinner7d00cc12014-03-17 23:08:06 +01007381 /* last character in partial decode? */
7382 if (in + insize >= endin && !final)
7383 break;
7384
Victor Stinner3a50e702011-10-18 21:21:00 +02007385 startinpos = in - startin;
7386 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007387 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007388 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 errors, &errorHandler,
7390 encoding, reason,
7391 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007392 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 {
7394 goto error;
7395 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007396 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 }
7398 else {
7399 in += insize;
7400 memcpy(out, buffer, outsize * sizeof(wchar_t));
7401 out += outsize;
7402 }
7403 }
7404
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007405 /* Shrink the buffer */
7406 assert(out - *buf <= *bufsize);
7407 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007408 /* (in - startin) <= size and size is an int */
7409 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007410
7411error:
7412 Py_XDECREF(encoding_obj);
7413 Py_XDECREF(errorHandler);
7414 Py_XDECREF(exc);
7415 return ret;
7416}
7417
Victor Stinner3a50e702011-10-18 21:21:00 +02007418static PyObject *
7419decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007420 const char *s, Py_ssize_t size,
7421 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007422{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007423 wchar_t *buf = NULL;
7424 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007425 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007426
Victor Stinner3a50e702011-10-18 21:21:00 +02007427 if (code_page < 0) {
7428 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7429 return NULL;
7430 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007431 if (size < 0) {
7432 PyErr_BadInternalCall();
7433 return NULL;
7434 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007435
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007436 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007437 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007438
Victor Stinner76a31a62011-11-04 00:05:13 +01007439 do
7440 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007441#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007442 if (size > DECODING_CHUNK_SIZE) {
7443 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007444 final = 0;
7445 done = 0;
7446 }
7447 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007449 {
7450 chunk_size = (int)size;
7451 final = (consumed == NULL);
7452 done = 1;
7453 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007454
Victor Stinner76a31a62011-11-04 00:05:13 +01007455 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007456 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007458 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007460
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007461 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 s, chunk_size);
7463 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007464 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007465 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007466 errors, final);
7467 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007468
7469 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007470 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007471 return NULL;
7472 }
7473
7474 if (consumed)
7475 *consumed += converted;
7476
7477 s += converted;
7478 size -= converted;
7479 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007480
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007481 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7482 PyMem_Free(buf);
7483 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007484}
7485
Alexander Belopolsky40018472011-02-26 01:02:56 +00007486PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007487PyUnicode_DecodeCodePageStateful(int code_page,
7488 const char *s,
7489 Py_ssize_t size,
7490 const char *errors,
7491 Py_ssize_t *consumed)
7492{
7493 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7494}
7495
7496PyObject *
7497PyUnicode_DecodeMBCSStateful(const char *s,
7498 Py_ssize_t size,
7499 const char *errors,
7500 Py_ssize_t *consumed)
7501{
7502 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7503}
7504
7505PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007506PyUnicode_DecodeMBCS(const char *s,
7507 Py_ssize_t size,
7508 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007509{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007510 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7511}
7512
Victor Stinner3a50e702011-10-18 21:21:00 +02007513static DWORD
7514encode_code_page_flags(UINT code_page, const char *errors)
7515{
7516 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007517 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 }
7519 else if (code_page == CP_UTF7) {
7520 /* CP_UTF7 only supports flags=0 */
7521 return 0;
7522 }
7523 else {
7524 if (errors != NULL && strcmp(errors, "replace") == 0)
7525 return 0;
7526 else
7527 return WC_NO_BEST_FIT_CHARS;
7528 }
7529}
7530
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007531/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 * Encode a Unicode string to a Windows code page into a byte string in strict
7533 * mode.
7534 *
7535 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007536 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007537 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007538static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007539encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007540 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007542{
Victor Stinner554f3f02010-06-16 23:33:54 +00007543 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 BOOL *pusedDefaultChar = &usedDefaultChar;
7545 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007546 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007547 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 const DWORD flags = encode_code_page_flags(code_page, NULL);
7549 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007550 /* Create a substring so that we can get the UTF-16 representation
7551 of just the slice under consideration. */
7552 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007553
Martin v. Löwis3d325192011-11-04 18:23:06 +01007554 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007555
Victor Stinner3a50e702011-10-18 21:21:00 +02007556 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007557 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007559 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007560
Victor Stinner2fc507f2011-11-04 20:06:39 +01007561 substring = PyUnicode_Substring(unicode, offset, offset+len);
7562 if (substring == NULL)
7563 return -1;
7564 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7565 if (p == NULL) {
7566 Py_DECREF(substring);
7567 return -1;
7568 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007569 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007570
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007571 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007573 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007574 NULL, 0,
7575 NULL, pusedDefaultChar);
7576 if (outsize <= 0)
7577 goto error;
7578 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007579 if (pusedDefaultChar && *pusedDefaultChar) {
7580 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007582 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007583
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007585 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007586 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007587 if (*outbytes == NULL) {
7588 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007590 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007591 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007592 }
7593 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007594 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 const Py_ssize_t n = PyBytes_Size(*outbytes);
7596 if (outsize > PY_SSIZE_T_MAX - n) {
7597 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007598 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007599 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007600 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007601 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7602 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007603 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007604 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007605 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007606 }
7607
7608 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007610 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 out, outsize,
7612 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007613 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007614 if (outsize <= 0)
7615 goto error;
7616 if (pusedDefaultChar && *pusedDefaultChar)
7617 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007618 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007619
Victor Stinner3a50e702011-10-18 21:21:00 +02007620error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007621 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007622 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7623 return -2;
7624 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007625 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007626}
7627
Victor Stinner3a50e702011-10-18 21:21:00 +02007628/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007629 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 * error handler.
7631 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007632 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007633 * -1 on other error.
7634 */
7635static int
7636encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007637 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007638 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007639{
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641 Py_ssize_t pos = unicode_offset;
7642 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007643 /* Ideally, we should get reason from FormatMessage. This is the Windows
7644 2000 English version of the message. */
7645 const char *reason = "invalid character";
7646 /* 4=maximum length of a UTF-8 sequence */
7647 char buffer[4];
7648 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7649 Py_ssize_t outsize;
7650 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 PyObject *errorHandler = NULL;
7652 PyObject *exc = NULL;
7653 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007654 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007655 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 PyObject *rep;
7657 int ret = -1;
7658
7659 assert(insize > 0);
7660
7661 encoding = code_page_name(code_page, &encoding_obj);
7662 if (encoding == NULL)
7663 return -1;
7664
7665 if (errors == NULL || strcmp(errors, "strict") == 0) {
7666 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7667 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007668 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007669 if (exc != NULL) {
7670 PyCodec_StrictErrors(exc);
7671 Py_DECREF(exc);
7672 }
7673 Py_XDECREF(encoding_obj);
7674 return -1;
7675 }
7676
7677 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7678 pusedDefaultChar = &usedDefaultChar;
7679 else
7680 pusedDefaultChar = NULL;
7681
7682 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7683 PyErr_NoMemory();
7684 goto error;
7685 }
7686 outsize = insize * Py_ARRAY_LENGTH(buffer);
7687
7688 if (*outbytes == NULL) {
7689 /* Create string object */
7690 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7691 if (*outbytes == NULL)
7692 goto error;
7693 out = PyBytes_AS_STRING(*outbytes);
7694 }
7695 else {
7696 /* Extend string object */
7697 Py_ssize_t n = PyBytes_Size(*outbytes);
7698 if (n > PY_SSIZE_T_MAX - outsize) {
7699 PyErr_NoMemory();
7700 goto error;
7701 }
7702 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7703 goto error;
7704 out = PyBytes_AS_STRING(*outbytes) + n;
7705 }
7706
7707 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007708 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007709 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007710 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7711 wchar_t chars[2];
7712 int charsize;
7713 if (ch < 0x10000) {
7714 chars[0] = (wchar_t)ch;
7715 charsize = 1;
7716 }
7717 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007718 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7719 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007720 charsize = 2;
7721 }
7722
Victor Stinner3a50e702011-10-18 21:21:00 +02007723 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007724 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007725 buffer, Py_ARRAY_LENGTH(buffer),
7726 NULL, pusedDefaultChar);
7727 if (outsize > 0) {
7728 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7729 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007730 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007731 memcpy(out, buffer, outsize);
7732 out += outsize;
7733 continue;
7734 }
7735 }
7736 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7737 PyErr_SetFromWindowsErr(0);
7738 goto error;
7739 }
7740
Victor Stinner3a50e702011-10-18 21:21:00 +02007741 rep = unicode_encode_call_errorhandler(
7742 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007743 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007744 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 if (rep == NULL)
7746 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007747 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007748
7749 if (PyBytes_Check(rep)) {
7750 outsize = PyBytes_GET_SIZE(rep);
7751 if (outsize != 1) {
7752 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7753 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7754 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7755 Py_DECREF(rep);
7756 goto error;
7757 }
7758 out = PyBytes_AS_STRING(*outbytes) + offset;
7759 }
7760 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7761 out += outsize;
7762 }
7763 else {
7764 Py_ssize_t i;
7765 enum PyUnicode_Kind kind;
7766 void *data;
7767
Benjamin Petersonbac79492012-01-14 13:34:47 -05007768 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007769 Py_DECREF(rep);
7770 goto error;
7771 }
7772
7773 outsize = PyUnicode_GET_LENGTH(rep);
7774 if (outsize != 1) {
7775 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7776 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7777 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7778 Py_DECREF(rep);
7779 goto error;
7780 }
7781 out = PyBytes_AS_STRING(*outbytes) + offset;
7782 }
7783 kind = PyUnicode_KIND(rep);
7784 data = PyUnicode_DATA(rep);
7785 for (i=0; i < outsize; i++) {
7786 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7787 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007788 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007789 encoding, unicode,
7790 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007791 "unable to encode error handler result to ASCII");
7792 Py_DECREF(rep);
7793 goto error;
7794 }
7795 *out = (unsigned char)ch;
7796 out++;
7797 }
7798 }
7799 Py_DECREF(rep);
7800 }
7801 /* write a NUL byte */
7802 *out = 0;
7803 outsize = out - PyBytes_AS_STRING(*outbytes);
7804 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7805 if (_PyBytes_Resize(outbytes, outsize) < 0)
7806 goto error;
7807 ret = 0;
7808
7809error:
7810 Py_XDECREF(encoding_obj);
7811 Py_XDECREF(errorHandler);
7812 Py_XDECREF(exc);
7813 return ret;
7814}
7815
Victor Stinner3a50e702011-10-18 21:21:00 +02007816static PyObject *
7817encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007818 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007819 const char *errors)
7820{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007821 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007822 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007823 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007824 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007825
Victor Stinner29dacf22015-01-26 16:41:32 +01007826 if (!PyUnicode_Check(unicode)) {
7827 PyErr_BadArgument();
7828 return NULL;
7829 }
7830
Benjamin Petersonbac79492012-01-14 13:34:47 -05007831 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007832 return NULL;
7833 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007834
Victor Stinner3a50e702011-10-18 21:21:00 +02007835 if (code_page < 0) {
7836 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7837 return NULL;
7838 }
7839
Martin v. Löwis3d325192011-11-04 18:23:06 +01007840 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007841 return PyBytes_FromStringAndSize(NULL, 0);
7842
Victor Stinner7581cef2011-11-03 22:32:33 +01007843 offset = 0;
7844 do
7845 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007846#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007847 if (len > DECODING_CHUNK_SIZE) {
7848 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007849 done = 0;
7850 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007851 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007852#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007853 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007854 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007855 done = 1;
7856 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007857
Victor Stinner76a31a62011-11-04 00:05:13 +01007858 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007859 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007860 errors);
7861 if (ret == -2)
7862 ret = encode_code_page_errors(code_page, &outbytes,
7863 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007864 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007865 if (ret < 0) {
7866 Py_XDECREF(outbytes);
7867 return NULL;
7868 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007869
Victor Stinner7581cef2011-11-03 22:32:33 +01007870 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007871 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007872 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007873
Victor Stinner3a50e702011-10-18 21:21:00 +02007874 return outbytes;
7875}
7876
7877PyObject *
7878PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7879 Py_ssize_t size,
7880 const char *errors)
7881{
Victor Stinner7581cef2011-11-03 22:32:33 +01007882 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007883 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007884 if (unicode == NULL)
7885 return NULL;
7886 res = encode_code_page(CP_ACP, unicode, errors);
7887 Py_DECREF(unicode);
7888 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007889}
7890
7891PyObject *
7892PyUnicode_EncodeCodePage(int code_page,
7893 PyObject *unicode,
7894 const char *errors)
7895{
Victor Stinner7581cef2011-11-03 22:32:33 +01007896 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007897}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007898
Alexander Belopolsky40018472011-02-26 01:02:56 +00007899PyObject *
7900PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007901{
Victor Stinner7581cef2011-11-03 22:32:33 +01007902 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007903}
7904
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007905#undef NEED_RETRY
7906
Steve Dowercc16be82016-09-08 10:35:16 -07007907#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007908
Guido van Rossumd57fd912000-03-10 22:53:23 +00007909/* --- Character Mapping Codec -------------------------------------------- */
7910
Victor Stinnerfb161b12013-04-18 01:44:27 +02007911static int
7912charmap_decode_string(const char *s,
7913 Py_ssize_t size,
7914 PyObject *mapping,
7915 const char *errors,
7916 _PyUnicodeWriter *writer)
7917{
7918 const char *starts = s;
7919 const char *e;
7920 Py_ssize_t startinpos, endinpos;
7921 PyObject *errorHandler = NULL, *exc = NULL;
7922 Py_ssize_t maplen;
7923 enum PyUnicode_Kind mapkind;
7924 void *mapdata;
7925 Py_UCS4 x;
7926 unsigned char ch;
7927
7928 if (PyUnicode_READY(mapping) == -1)
7929 return -1;
7930
7931 maplen = PyUnicode_GET_LENGTH(mapping);
7932 mapdata = PyUnicode_DATA(mapping);
7933 mapkind = PyUnicode_KIND(mapping);
7934
7935 e = s + size;
7936
7937 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7938 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7939 * is disabled in encoding aliases, latin1 is preferred because
7940 * its implementation is faster. */
7941 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7942 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7943 Py_UCS4 maxchar = writer->maxchar;
7944
7945 assert (writer->kind == PyUnicode_1BYTE_KIND);
7946 while (s < e) {
7947 ch = *s;
7948 x = mapdata_ucs1[ch];
7949 if (x > maxchar) {
7950 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7951 goto onError;
7952 maxchar = writer->maxchar;
7953 outdata = (Py_UCS1 *)writer->data;
7954 }
7955 outdata[writer->pos] = x;
7956 writer->pos++;
7957 ++s;
7958 }
7959 return 0;
7960 }
7961
7962 while (s < e) {
7963 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7964 enum PyUnicode_Kind outkind = writer->kind;
7965 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7966 if (outkind == PyUnicode_1BYTE_KIND) {
7967 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7968 Py_UCS4 maxchar = writer->maxchar;
7969 while (s < e) {
7970 ch = *s;
7971 x = mapdata_ucs2[ch];
7972 if (x > maxchar)
7973 goto Error;
7974 outdata[writer->pos] = x;
7975 writer->pos++;
7976 ++s;
7977 }
7978 break;
7979 }
7980 else if (outkind == PyUnicode_2BYTE_KIND) {
7981 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7982 while (s < e) {
7983 ch = *s;
7984 x = mapdata_ucs2[ch];
7985 if (x == 0xFFFE)
7986 goto Error;
7987 outdata[writer->pos] = x;
7988 writer->pos++;
7989 ++s;
7990 }
7991 break;
7992 }
7993 }
7994 ch = *s;
7995
7996 if (ch < maplen)
7997 x = PyUnicode_READ(mapkind, mapdata, ch);
7998 else
7999 x = 0xfffe; /* invalid value */
8000Error:
8001 if (x == 0xfffe)
8002 {
8003 /* undefined mapping */
8004 startinpos = s-starts;
8005 endinpos = startinpos+1;
8006 if (unicode_decode_call_errorhandler_writer(
8007 errors, &errorHandler,
8008 "charmap", "character maps to <undefined>",
8009 &starts, &e, &startinpos, &endinpos, &exc, &s,
8010 writer)) {
8011 goto onError;
8012 }
8013 continue;
8014 }
8015
8016 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8017 goto onError;
8018 ++s;
8019 }
8020 Py_XDECREF(errorHandler);
8021 Py_XDECREF(exc);
8022 return 0;
8023
8024onError:
8025 Py_XDECREF(errorHandler);
8026 Py_XDECREF(exc);
8027 return -1;
8028}
8029
8030static int
8031charmap_decode_mapping(const char *s,
8032 Py_ssize_t size,
8033 PyObject *mapping,
8034 const char *errors,
8035 _PyUnicodeWriter *writer)
8036{
8037 const char *starts = s;
8038 const char *e;
8039 Py_ssize_t startinpos, endinpos;
8040 PyObject *errorHandler = NULL, *exc = NULL;
8041 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008042 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008043
8044 e = s + size;
8045
8046 while (s < e) {
8047 ch = *s;
8048
8049 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8050 key = PyLong_FromLong((long)ch);
8051 if (key == NULL)
8052 goto onError;
8053
8054 item = PyObject_GetItem(mapping, key);
8055 Py_DECREF(key);
8056 if (item == NULL) {
8057 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8058 /* No mapping found means: mapping is undefined. */
8059 PyErr_Clear();
8060 goto Undefined;
8061 } else
8062 goto onError;
8063 }
8064
8065 /* Apply mapping */
8066 if (item == Py_None)
8067 goto Undefined;
8068 if (PyLong_Check(item)) {
8069 long value = PyLong_AS_LONG(item);
8070 if (value == 0xFFFE)
8071 goto Undefined;
8072 if (value < 0 || value > MAX_UNICODE) {
8073 PyErr_Format(PyExc_TypeError,
8074 "character mapping must be in range(0x%lx)",
8075 (unsigned long)MAX_UNICODE + 1);
8076 goto onError;
8077 }
8078
8079 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8080 goto onError;
8081 }
8082 else if (PyUnicode_Check(item)) {
8083 if (PyUnicode_READY(item) == -1)
8084 goto onError;
8085 if (PyUnicode_GET_LENGTH(item) == 1) {
8086 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8087 if (value == 0xFFFE)
8088 goto Undefined;
8089 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8090 goto onError;
8091 }
8092 else {
8093 writer->overallocate = 1;
8094 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8095 goto onError;
8096 }
8097 }
8098 else {
8099 /* wrong return value */
8100 PyErr_SetString(PyExc_TypeError,
8101 "character mapping must return integer, None or str");
8102 goto onError;
8103 }
8104 Py_CLEAR(item);
8105 ++s;
8106 continue;
8107
8108Undefined:
8109 /* undefined mapping */
8110 Py_CLEAR(item);
8111 startinpos = s-starts;
8112 endinpos = startinpos+1;
8113 if (unicode_decode_call_errorhandler_writer(
8114 errors, &errorHandler,
8115 "charmap", "character maps to <undefined>",
8116 &starts, &e, &startinpos, &endinpos, &exc, &s,
8117 writer)) {
8118 goto onError;
8119 }
8120 }
8121 Py_XDECREF(errorHandler);
8122 Py_XDECREF(exc);
8123 return 0;
8124
8125onError:
8126 Py_XDECREF(item);
8127 Py_XDECREF(errorHandler);
8128 Py_XDECREF(exc);
8129 return -1;
8130}
8131
Alexander Belopolsky40018472011-02-26 01:02:56 +00008132PyObject *
8133PyUnicode_DecodeCharmap(const char *s,
8134 Py_ssize_t size,
8135 PyObject *mapping,
8136 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008137{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008138 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008139
Guido van Rossumd57fd912000-03-10 22:53:23 +00008140 /* Default to Latin-1 */
8141 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008142 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008143
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008145 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008146 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008147 writer.min_length = size;
8148 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008149 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008150
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008151 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008152 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8153 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008154 }
8155 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008156 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8157 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008158 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008159 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008160
Benjamin Peterson29060642009-01-31 22:14:21 +00008161 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008162 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008163 return NULL;
8164}
8165
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008166/* Charmap encoding: the lookup table */
8167
Alexander Belopolsky40018472011-02-26 01:02:56 +00008168struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008169 PyObject_HEAD
8170 unsigned char level1[32];
8171 int count2, count3;
8172 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008173};
8174
8175static PyObject*
8176encoding_map_size(PyObject *obj, PyObject* args)
8177{
8178 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008179 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008180 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181}
8182
8183static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008184 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008185 PyDoc_STR("Return the size (in bytes) of this object") },
8186 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008187};
8188
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008189static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008190 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008191 "EncodingMap", /*tp_name*/
8192 sizeof(struct encoding_map), /*tp_basicsize*/
8193 0, /*tp_itemsize*/
8194 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008195 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008196 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008197 0, /*tp_getattr*/
8198 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008199 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008200 0, /*tp_repr*/
8201 0, /*tp_as_number*/
8202 0, /*tp_as_sequence*/
8203 0, /*tp_as_mapping*/
8204 0, /*tp_hash*/
8205 0, /*tp_call*/
8206 0, /*tp_str*/
8207 0, /*tp_getattro*/
8208 0, /*tp_setattro*/
8209 0, /*tp_as_buffer*/
8210 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8211 0, /*tp_doc*/
8212 0, /*tp_traverse*/
8213 0, /*tp_clear*/
8214 0, /*tp_richcompare*/
8215 0, /*tp_weaklistoffset*/
8216 0, /*tp_iter*/
8217 0, /*tp_iternext*/
8218 encoding_map_methods, /*tp_methods*/
8219 0, /*tp_members*/
8220 0, /*tp_getset*/
8221 0, /*tp_base*/
8222 0, /*tp_dict*/
8223 0, /*tp_descr_get*/
8224 0, /*tp_descr_set*/
8225 0, /*tp_dictoffset*/
8226 0, /*tp_init*/
8227 0, /*tp_alloc*/
8228 0, /*tp_new*/
8229 0, /*tp_free*/
8230 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008231};
8232
8233PyObject*
8234PyUnicode_BuildEncodingMap(PyObject* string)
8235{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008236 PyObject *result;
8237 struct encoding_map *mresult;
8238 int i;
8239 int need_dict = 0;
8240 unsigned char level1[32];
8241 unsigned char level2[512];
8242 unsigned char *mlevel1, *mlevel2, *mlevel3;
8243 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008244 int kind;
8245 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008246 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008247 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008248
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008249 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 PyErr_BadArgument();
8251 return NULL;
8252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008253 kind = PyUnicode_KIND(string);
8254 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008255 length = PyUnicode_GET_LENGTH(string);
8256 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008257 memset(level1, 0xFF, sizeof level1);
8258 memset(level2, 0xFF, sizeof level2);
8259
8260 /* If there isn't a one-to-one mapping of NULL to \0,
8261 or if there are non-BMP characters, we need to use
8262 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008263 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008265 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 ch = PyUnicode_READ(kind, data, i);
8268 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008269 need_dict = 1;
8270 break;
8271 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008272 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 /* unmapped character */
8274 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008275 l1 = ch >> 11;
8276 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277 if (level1[l1] == 0xFF)
8278 level1[l1] = count2++;
8279 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008280 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 }
8282
8283 if (count2 >= 0xFF || count3 >= 0xFF)
8284 need_dict = 1;
8285
8286 if (need_dict) {
8287 PyObject *result = PyDict_New();
8288 PyObject *key, *value;
8289 if (!result)
8290 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008291 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008292 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008293 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 if (!key || !value)
8295 goto failed1;
8296 if (PyDict_SetItem(result, key, value) == -1)
8297 goto failed1;
8298 Py_DECREF(key);
8299 Py_DECREF(value);
8300 }
8301 return result;
8302 failed1:
8303 Py_XDECREF(key);
8304 Py_XDECREF(value);
8305 Py_DECREF(result);
8306 return NULL;
8307 }
8308
8309 /* Create a three-level trie */
8310 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8311 16*count2 + 128*count3 - 1);
8312 if (!result)
8313 return PyErr_NoMemory();
8314 PyObject_Init(result, &EncodingMapType);
8315 mresult = (struct encoding_map*)result;
8316 mresult->count2 = count2;
8317 mresult->count3 = count3;
8318 mlevel1 = mresult->level1;
8319 mlevel2 = mresult->level23;
8320 mlevel3 = mresult->level23 + 16*count2;
8321 memcpy(mlevel1, level1, 32);
8322 memset(mlevel2, 0xFF, 16*count2);
8323 memset(mlevel3, 0, 128*count3);
8324 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008325 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008326 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008327 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8328 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008329 /* unmapped character */
8330 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008331 o1 = ch>>11;
8332 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 i2 = 16*mlevel1[o1] + o2;
8334 if (mlevel2[i2] == 0xFF)
8335 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008336 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 i3 = 128*mlevel2[i2] + o3;
8338 mlevel3[i3] = i;
8339 }
8340 return result;
8341}
8342
8343static int
Victor Stinner22168992011-11-20 17:09:18 +01008344encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008345{
8346 struct encoding_map *map = (struct encoding_map*)mapping;
8347 int l1 = c>>11;
8348 int l2 = (c>>7) & 0xF;
8349 int l3 = c & 0x7F;
8350 int i;
8351
Victor Stinner22168992011-11-20 17:09:18 +01008352 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008353 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008354 if (c == 0)
8355 return 0;
8356 /* level 1*/
8357 i = map->level1[l1];
8358 if (i == 0xFF) {
8359 return -1;
8360 }
8361 /* level 2*/
8362 i = map->level23[16*i+l2];
8363 if (i == 0xFF) {
8364 return -1;
8365 }
8366 /* level 3 */
8367 i = map->level23[16*map->count2 + 128*i + l3];
8368 if (i == 0) {
8369 return -1;
8370 }
8371 return i;
8372}
8373
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374/* Lookup the character ch in the mapping. If the character
8375 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008376 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008377static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008378charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008379{
Christian Heimes217cfd12007-12-02 14:31:20 +00008380 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 PyObject *x;
8382
8383 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 x = PyObject_GetItem(mapping, w);
8386 Py_DECREF(w);
8387 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8389 /* No mapping found means: mapping is undefined. */
8390 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008391 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 } else
8393 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008394 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008395 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008397 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008398 long value = PyLong_AS_LONG(x);
8399 if (value < 0 || value > 255) {
8400 PyErr_SetString(PyExc_TypeError,
8401 "character mapping must be in range(256)");
8402 Py_DECREF(x);
8403 return NULL;
8404 }
8405 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008406 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008407 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008408 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008409 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008410 /* wrong return value */
8411 PyErr_Format(PyExc_TypeError,
8412 "character mapping must return integer, bytes or None, not %.400s",
8413 x->ob_type->tp_name);
8414 Py_DECREF(x);
8415 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416 }
8417}
8418
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008419static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008420charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008421{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008422 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8423 /* exponentially overallocate to minimize reallocations */
8424 if (requiredsize < 2*outsize)
8425 requiredsize = 2*outsize;
8426 if (_PyBytes_Resize(outobj, requiredsize))
8427 return -1;
8428 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008429}
8430
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008434/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008435 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008436 space is available. Return a new reference to the object that
8437 was put in the output buffer, or Py_None, if the mapping was undefined
8438 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008439 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008440static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008441charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008442 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008444 PyObject *rep;
8445 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008446 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447
Christian Heimes90aa7642007-12-19 02:45:37 +00008448 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008449 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008451 if (res == -1)
8452 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008453 if (outsize<requiredsize)
8454 if (charmapencode_resize(outobj, outpos, requiredsize))
8455 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008456 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 outstart[(*outpos)++] = (char)res;
8458 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008459 }
8460
8461 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008462 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008463 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008464 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 Py_DECREF(rep);
8466 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008467 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008468 if (PyLong_Check(rep)) {
8469 Py_ssize_t requiredsize = *outpos+1;
8470 if (outsize<requiredsize)
8471 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8472 Py_DECREF(rep);
8473 return enc_EXCEPTION;
8474 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008475 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 else {
8479 const char *repchars = PyBytes_AS_STRING(rep);
8480 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8481 Py_ssize_t requiredsize = *outpos+repsize;
8482 if (outsize<requiredsize)
8483 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8484 Py_DECREF(rep);
8485 return enc_EXCEPTION;
8486 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008487 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008488 memcpy(outstart + *outpos, repchars, repsize);
8489 *outpos += repsize;
8490 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008491 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008492 Py_DECREF(rep);
8493 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008494}
8495
8496/* handle an error in PyUnicode_EncodeCharmap
8497 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008498static int
8499charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008500 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008501 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008502 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008503 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008504{
8505 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008506 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008507 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008508 enum PyUnicode_Kind kind;
8509 void *data;
8510 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008511 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008512 Py_ssize_t collstartpos = *inpos;
8513 Py_ssize_t collendpos = *inpos+1;
8514 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008515 const char *encoding = "charmap";
8516 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008517 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008518 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008519 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008520
Benjamin Petersonbac79492012-01-14 13:34:47 -05008521 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008522 return -1;
8523 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524 /* find all unencodable characters */
8525 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008526 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008527 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008528 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008529 val = encoding_map_lookup(ch, mapping);
8530 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008531 break;
8532 ++collendpos;
8533 continue;
8534 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008535
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008536 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8537 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008538 if (rep==NULL)
8539 return -1;
8540 else if (rep!=Py_None) {
8541 Py_DECREF(rep);
8542 break;
8543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008546 }
8547 /* cache callback name lookup
8548 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008549 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008550 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008551
8552 switch (*error_handler) {
8553 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008554 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008555 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008556
8557 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008558 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008559 x = charmapencode_output('?', mapping, res, respos);
8560 if (x==enc_EXCEPTION) {
8561 return -1;
8562 }
8563 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008564 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008565 return -1;
8566 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008567 }
8568 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008569 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008570 *inpos = collendpos;
8571 break;
Victor Stinner50149202015-09-22 00:26:54 +02008572
8573 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 /* generate replacement (temporarily (mis)uses p) */
8575 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008576 char buffer[2+29+1+1];
8577 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008578 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008579 for (cp = buffer; *cp; ++cp) {
8580 x = charmapencode_output(*cp, mapping, res, respos);
8581 if (x==enc_EXCEPTION)
8582 return -1;
8583 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008584 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008585 return -1;
8586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008587 }
8588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008589 *inpos = collendpos;
8590 break;
Victor Stinner50149202015-09-22 00:26:54 +02008591
Benjamin Peterson14339b62009-01-31 16:36:08 +00008592 default:
Victor Stinner50149202015-09-22 00:26:54 +02008593 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008594 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008596 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008598 if (PyBytes_Check(repunicode)) {
8599 /* Directly copy bytes result to output. */
8600 Py_ssize_t outsize = PyBytes_Size(*res);
8601 Py_ssize_t requiredsize;
8602 repsize = PyBytes_Size(repunicode);
8603 requiredsize = *respos + repsize;
8604 if (requiredsize > outsize)
8605 /* Make room for all additional bytes. */
8606 if (charmapencode_resize(res, respos, requiredsize)) {
8607 Py_DECREF(repunicode);
8608 return -1;
8609 }
8610 memcpy(PyBytes_AsString(*res) + *respos,
8611 PyBytes_AsString(repunicode), repsize);
8612 *respos += repsize;
8613 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008614 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008615 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008616 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008617 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008618 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008619 Py_DECREF(repunicode);
8620 return -1;
8621 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008622 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008623 data = PyUnicode_DATA(repunicode);
8624 kind = PyUnicode_KIND(repunicode);
8625 for (index = 0; index < repsize; index++) {
8626 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8627 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008628 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008629 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008630 return -1;
8631 }
8632 else if (x==enc_FAILED) {
8633 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008634 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 return -1;
8636 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008637 }
8638 *inpos = newpos;
8639 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008640 }
8641 return 0;
8642}
8643
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008645_PyUnicode_EncodeCharmap(PyObject *unicode,
8646 PyObject *mapping,
8647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008648{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008649 /* output object */
8650 PyObject *res = NULL;
8651 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008652 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008653 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008654 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008655 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008656 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008657 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008658 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008659 void *data;
8660 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661
Benjamin Petersonbac79492012-01-14 13:34:47 -05008662 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008663 return NULL;
8664 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008665 data = PyUnicode_DATA(unicode);
8666 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008667
Guido van Rossumd57fd912000-03-10 22:53:23 +00008668 /* Default to Latin-1 */
8669 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008670 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008672 /* allocate enough for a simple encoding without
8673 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008674 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008675 if (res == NULL)
8676 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008677 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008678 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008680 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008681 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008683 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008684 if (x==enc_EXCEPTION) /* error */
8685 goto onError;
8686 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008687 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008689 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008690 &res, &respos)) {
8691 goto onError;
8692 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008693 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 else
8695 /* done with this character => adjust input position */
8696 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008697 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008700 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008701 if (_PyBytes_Resize(&res, respos) < 0)
8702 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008703
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008705 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706 return res;
8707
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 Py_XDECREF(res);
8710 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008711 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008712 return NULL;
8713}
8714
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008715/* Deprecated */
8716PyObject *
8717PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8718 Py_ssize_t size,
8719 PyObject *mapping,
8720 const char *errors)
8721{
8722 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008723 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008724 if (unicode == NULL)
8725 return NULL;
8726 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8727 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008728 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008729}
8730
Alexander Belopolsky40018472011-02-26 01:02:56 +00008731PyObject *
8732PyUnicode_AsCharmapString(PyObject *unicode,
8733 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734{
8735 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 PyErr_BadArgument();
8737 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008739 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008740}
8741
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008742/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008743static void
8744make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008745 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008746 Py_ssize_t startpos, Py_ssize_t endpos,
8747 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008748{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008750 *exceptionObject = _PyUnicodeTranslateError_Create(
8751 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752 }
8753 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8755 goto onError;
8756 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8757 goto onError;
8758 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8759 goto onError;
8760 return;
8761 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008762 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008763 }
8764}
8765
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008766/* error handling callback helper:
8767 build arguments, call the callback and check the arguments,
8768 put the result into newpos and return the replacement string, which
8769 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008770static PyObject *
8771unicode_translate_call_errorhandler(const char *errors,
8772 PyObject **errorHandler,
8773 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008774 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008775 Py_ssize_t startpos, Py_ssize_t endpos,
8776 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008777{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008778 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008780 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781 PyObject *restuple;
8782 PyObject *resunicode;
8783
8784 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008785 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008787 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008788 }
8789
8790 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008791 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008793 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008794
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008795 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008799 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 Py_DECREF(restuple);
8801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008803 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 &resunicode, &i_newpos)) {
8805 Py_DECREF(restuple);
8806 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008808 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008809 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008810 else
8811 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008812 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008813 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 Py_DECREF(restuple);
8815 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008816 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008817 Py_INCREF(resunicode);
8818 Py_DECREF(restuple);
8819 return resunicode;
8820}
8821
8822/* Lookup the character ch in the mapping and put the result in result,
8823 which must be decrefed by the caller.
8824 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008825static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008826charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008827{
Christian Heimes217cfd12007-12-02 14:31:20 +00008828 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 PyObject *x;
8830
8831 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833 x = PyObject_GetItem(mapping, w);
8834 Py_DECREF(w);
8835 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8837 /* No mapping found means: use 1:1 mapping. */
8838 PyErr_Clear();
8839 *result = NULL;
8840 return 0;
8841 } else
8842 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008843 }
8844 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008845 *result = x;
8846 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008848 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008850 if (value < 0 || value > MAX_UNICODE) {
8851 PyErr_Format(PyExc_ValueError,
8852 "character mapping must be in range(0x%x)",
8853 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008854 Py_DECREF(x);
8855 return -1;
8856 }
8857 *result = x;
8858 return 0;
8859 }
8860 else if (PyUnicode_Check(x)) {
8861 *result = x;
8862 return 0;
8863 }
8864 else {
8865 /* wrong return value */
8866 PyErr_SetString(PyExc_TypeError,
8867 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008868 Py_DECREF(x);
8869 return -1;
8870 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008871}
Victor Stinner1194ea02014-04-04 19:37:40 +02008872
8873/* lookup the character, write the result into the writer.
8874 Return 1 if the result was written into the writer, return 0 if the mapping
8875 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008876static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008877charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8878 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008879{
Victor Stinner1194ea02014-04-04 19:37:40 +02008880 PyObject *item;
8881
8882 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008883 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008884
8885 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008887 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008888 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008889 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008890 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008891 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008892
8893 if (item == Py_None) {
8894 Py_DECREF(item);
8895 return 0;
8896 }
8897
8898 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008899 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8900 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8901 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008902 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8903 Py_DECREF(item);
8904 return -1;
8905 }
8906 Py_DECREF(item);
8907 return 1;
8908 }
8909
8910 if (!PyUnicode_Check(item)) {
8911 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008912 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008913 }
8914
8915 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8916 Py_DECREF(item);
8917 return -1;
8918 }
8919
8920 Py_DECREF(item);
8921 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008922}
8923
Victor Stinner89a76ab2014-04-05 11:44:04 +02008924static int
8925unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8926 Py_UCS1 *translate)
8927{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008928 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008929 int ret = 0;
8930
Victor Stinner89a76ab2014-04-05 11:44:04 +02008931 if (charmaptranslate_lookup(ch, mapping, &item)) {
8932 return -1;
8933 }
8934
8935 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008936 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008937 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008938 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008939 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 /* not found => default to 1:1 mapping */
8941 translate[ch] = ch;
8942 return 1;
8943 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008944 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008945 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008946 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8947 used it */
8948 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008949 /* invalid character or character outside ASCII:
8950 skip the fast translate */
8951 goto exit;
8952 }
8953 translate[ch] = (Py_UCS1)replace;
8954 }
8955 else if (PyUnicode_Check(item)) {
8956 Py_UCS4 replace;
8957
8958 if (PyUnicode_READY(item) == -1) {
8959 Py_DECREF(item);
8960 return -1;
8961 }
8962 if (PyUnicode_GET_LENGTH(item) != 1)
8963 goto exit;
8964
8965 replace = PyUnicode_READ_CHAR(item, 0);
8966 if (replace > 127)
8967 goto exit;
8968 translate[ch] = (Py_UCS1)replace;
8969 }
8970 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008971 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008972 goto exit;
8973 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008974 ret = 1;
8975
Benjamin Peterson1365de72014-04-07 20:15:41 -04008976 exit:
8977 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978 return ret;
8979}
8980
8981/* Fast path for ascii => ascii translation. Return 1 if the whole string
8982 was translated into writer, return 0 if the input string was partially
8983 translated into writer, raise an exception and return -1 on error. */
8984static int
8985unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008986 _PyUnicodeWriter *writer, int ignore,
8987 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008988{
Victor Stinner872b2912014-04-05 14:27:07 +02008989 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008990 Py_ssize_t len;
8991 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008992 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008993
Victor Stinner89a76ab2014-04-05 11:44:04 +02008994 len = PyUnicode_GET_LENGTH(input);
8995
Victor Stinner872b2912014-04-05 14:27:07 +02008996 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997
8998 in = PyUnicode_1BYTE_DATA(input);
8999 end = in + len;
9000
9001 assert(PyUnicode_IS_ASCII(writer->buffer));
9002 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9003 out = PyUnicode_1BYTE_DATA(writer->buffer);
9004
Victor Stinner872b2912014-04-05 14:27:07 +02009005 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009006 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009007 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009009 int translate = unicode_fast_translate_lookup(mapping, ch,
9010 ascii_table);
9011 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009013 if (translate == 0)
9014 goto exit;
9015 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 }
Victor Stinner872b2912014-04-05 14:27:07 +02009017 if (ch2 == 0xfe) {
9018 if (ignore)
9019 continue;
9020 goto exit;
9021 }
9022 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009023 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009024 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009025 }
Victor Stinner872b2912014-04-05 14:27:07 +02009026 res = 1;
9027
9028exit:
9029 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009030 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009031 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009032}
9033
Victor Stinner3222da22015-10-01 22:07:32 +02009034static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009035_PyUnicode_TranslateCharmap(PyObject *input,
9036 PyObject *mapping,
9037 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009038{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009041 Py_ssize_t size, i;
9042 int kind;
9043 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009044 _PyUnicodeWriter writer;
9045 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009046 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009047 PyObject *errorHandler = NULL;
9048 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009049 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009050 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009051
Guido van Rossumd57fd912000-03-10 22:53:23 +00009052 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009053 PyErr_BadArgument();
9054 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009055 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009056
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009057 if (PyUnicode_READY(input) == -1)
9058 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009060 kind = PyUnicode_KIND(input);
9061 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009062
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009063 if (size == 0)
9064 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009065
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009066 /* allocate enough for a simple 1:1 translation without
9067 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009068 _PyUnicodeWriter_Init(&writer);
9069 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071
Victor Stinner872b2912014-04-05 14:27:07 +02009072 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9073
Victor Stinner33798672016-03-01 21:59:58 +01009074 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009075 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009076 if (PyUnicode_IS_ASCII(input)) {
9077 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9078 if (res < 0) {
9079 _PyUnicodeWriter_Dealloc(&writer);
9080 return NULL;
9081 }
9082 if (res == 1)
9083 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009084 }
Victor Stinner33798672016-03-01 21:59:58 +01009085 else {
9086 i = 0;
9087 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009088
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009089 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009091 int translate;
9092 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9093 Py_ssize_t newpos;
9094 /* startpos for collecting untranslatable chars */
9095 Py_ssize_t collstart;
9096 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009097 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098
Victor Stinner1194ea02014-04-04 19:37:40 +02009099 ch = PyUnicode_READ(kind, data, i);
9100 translate = charmaptranslate_output(ch, mapping, &writer);
9101 if (translate < 0)
9102 goto onError;
9103
9104 if (translate != 0) {
9105 /* it worked => adjust input pointer */
9106 ++i;
9107 continue;
9108 }
9109
9110 /* untranslatable character */
9111 collstart = i;
9112 collend = i+1;
9113
9114 /* find all untranslatable characters */
9115 while (collend < size) {
9116 PyObject *x;
9117 ch = PyUnicode_READ(kind, data, collend);
9118 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009119 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009120 Py_XDECREF(x);
9121 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009122 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009123 ++collend;
9124 }
9125
9126 if (ignore) {
9127 i = collend;
9128 }
9129 else {
9130 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9131 reason, input, &exc,
9132 collstart, collend, &newpos);
9133 if (repunicode == NULL)
9134 goto onError;
9135 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009136 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009137 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009138 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009139 Py_DECREF(repunicode);
9140 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009141 }
9142 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009143 Py_XDECREF(exc);
9144 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009145 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009146
Benjamin Peterson29060642009-01-31 22:14:21 +00009147 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009148 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009149 Py_XDECREF(exc);
9150 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009151 return NULL;
9152}
9153
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009154/* Deprecated. Use PyUnicode_Translate instead. */
9155PyObject *
9156PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9157 Py_ssize_t size,
9158 PyObject *mapping,
9159 const char *errors)
9160{
Christian Heimes5f520f42012-09-11 14:03:25 +02009161 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009162 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009163 if (!unicode)
9164 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009165 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9166 Py_DECREF(unicode);
9167 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009168}
9169
Alexander Belopolsky40018472011-02-26 01:02:56 +00009170PyObject *
9171PyUnicode_Translate(PyObject *str,
9172 PyObject *mapping,
9173 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009174{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009175 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009176 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009177 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178}
Tim Petersced69f82003-09-16 20:30:58 +00009179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009180PyObject *
9181_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9182{
9183 if (!PyUnicode_Check(unicode)) {
9184 PyErr_BadInternalCall();
9185 return NULL;
9186 }
9187 if (PyUnicode_READY(unicode) == -1)
9188 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009189 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009190 /* If the string is already ASCII, just return the same string */
9191 Py_INCREF(unicode);
9192 return unicode;
9193 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009194
9195 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9196 PyObject *result = PyUnicode_New(len, 127);
9197 if (result == NULL) {
9198 return NULL;
9199 }
9200
9201 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9202 int kind = PyUnicode_KIND(unicode);
9203 const void *data = PyUnicode_DATA(unicode);
9204 Py_ssize_t i;
9205 for (i = 0; i < len; ++i) {
9206 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9207 if (ch < 127) {
9208 out[i] = ch;
9209 }
9210 else if (Py_UNICODE_ISSPACE(ch)) {
9211 out[i] = ' ';
9212 }
9213 else {
9214 int decimal = Py_UNICODE_TODECIMAL(ch);
9215 if (decimal < 0) {
9216 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009217 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009218 _PyUnicode_LENGTH(result) = i + 1;
9219 break;
9220 }
9221 out[i] = '0' + decimal;
9222 }
9223 }
9224
INADA Naoki16dfca42018-07-14 12:06:43 +09009225 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009226 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227}
9228
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009229PyObject *
9230PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9231 Py_ssize_t length)
9232{
Victor Stinnerf0124502011-11-21 23:12:56 +01009233 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009234 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009235 Py_UCS4 maxchar;
9236 enum PyUnicode_Kind kind;
9237 void *data;
9238
Victor Stinner99d7ad02012-02-22 13:37:39 +01009239 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009240 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009241 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009242 if (ch > 127) {
9243 int decimal = Py_UNICODE_TODECIMAL(ch);
9244 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009245 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009246 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009247 }
9248 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009249
9250 /* Copy to a new string */
9251 decimal = PyUnicode_New(length, maxchar);
9252 if (decimal == NULL)
9253 return decimal;
9254 kind = PyUnicode_KIND(decimal);
9255 data = PyUnicode_DATA(decimal);
9256 /* Iterate over code points */
9257 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009258 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009259 if (ch > 127) {
9260 int decimal = Py_UNICODE_TODECIMAL(ch);
9261 if (decimal >= 0)
9262 ch = '0' + decimal;
9263 }
9264 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009265 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009266 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009267}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009268/* --- Decimal Encoder ---------------------------------------------------- */
9269
Alexander Belopolsky40018472011-02-26 01:02:56 +00009270int
9271PyUnicode_EncodeDecimal(Py_UNICODE *s,
9272 Py_ssize_t length,
9273 char *output,
9274 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009275{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009276 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009277 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009278 enum PyUnicode_Kind kind;
9279 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009280
9281 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009282 PyErr_BadArgument();
9283 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009284 }
9285
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009286 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009287 if (unicode == NULL)
9288 return -1;
9289
Victor Stinner42bf7752011-11-21 22:52:58 +01009290 kind = PyUnicode_KIND(unicode);
9291 data = PyUnicode_DATA(unicode);
9292
Victor Stinnerb84d7232011-11-22 01:50:07 +01009293 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009294 PyObject *exc;
9295 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009296 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009297 Py_ssize_t startpos;
9298
9299 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009300
Benjamin Peterson29060642009-01-31 22:14:21 +00009301 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009302 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009303 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009304 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009305 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009306 decimal = Py_UNICODE_TODECIMAL(ch);
9307 if (decimal >= 0) {
9308 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009309 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 continue;
9311 }
9312 if (0 < ch && ch < 256) {
9313 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009314 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009315 continue;
9316 }
Victor Stinner6345be92011-11-25 20:09:01 +01009317
Victor Stinner42bf7752011-11-21 22:52:58 +01009318 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009319 exc = NULL;
9320 raise_encode_exception(&exc, "decimal", unicode,
9321 startpos, startpos+1,
9322 "invalid decimal Unicode string");
9323 Py_XDECREF(exc);
9324 Py_DECREF(unicode);
9325 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009326 }
9327 /* 0-terminate the output string */
9328 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009329 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009330 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009331}
9332
Guido van Rossumd57fd912000-03-10 22:53:23 +00009333/* --- Helpers ------------------------------------------------------------ */
9334
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and turns negative `start`/`end` into offsets from
   the end of the sequence (clamped to 0).  `start` and `end` must be
   modifiable lvalues; all three arguments are evaluated more than once, so
   they must not have side effects.

   The body is wrapped in do { ... } while (0) so that the macro expands to a
   single statement and can be used safely in an unbraced if/else; invoke it
   with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009350static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009351any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009353 Py_ssize_t end,
9354 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009356 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009357 void *buf1, *buf2;
9358 Py_ssize_t len1, len2, result;
9359
9360 kind1 = PyUnicode_KIND(s1);
9361 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009362 if (kind1 < kind2)
9363 return -1;
9364
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009365 len1 = PyUnicode_GET_LENGTH(s1);
9366 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009367 ADJUST_INDICES(start, end, len1);
9368 if (end - start < len2)
9369 return -1;
9370
9371 buf1 = PyUnicode_DATA(s1);
9372 buf2 = PyUnicode_DATA(s2);
9373 if (len2 == 1) {
9374 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9375 result = findchar((const char *)buf1 + kind1*start,
9376 kind1, end - start, ch, direction);
9377 if (result == -1)
9378 return -1;
9379 else
9380 return start + result;
9381 }
9382
9383 if (kind2 != kind1) {
9384 buf2 = _PyUnicode_AsKind(s2, kind1);
9385 if (!buf2)
9386 return -2;
9387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009388
Victor Stinner794d5672011-10-10 03:21:36 +02009389 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009390 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009391 case PyUnicode_1BYTE_KIND:
9392 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9393 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9394 else
9395 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9396 break;
9397 case PyUnicode_2BYTE_KIND:
9398 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9399 break;
9400 case PyUnicode_4BYTE_KIND:
9401 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9402 break;
9403 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009404 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009405 }
9406 }
9407 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009408 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009409 case PyUnicode_1BYTE_KIND:
9410 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9411 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9412 else
9413 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9414 break;
9415 case PyUnicode_2BYTE_KIND:
9416 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9417 break;
9418 case PyUnicode_4BYTE_KIND:
9419 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9420 break;
9421 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009422 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009423 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009424 }
9425
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009426 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009427 PyMem_Free(buf2);
9428
9429 return result;
9430}
9431
Victor Stinner59423e32018-11-26 13:40:01 +01009432/* _PyUnicode_InsertThousandsGrouping() helper functions */
9433#include "stringlib/localeutil.h"
9434
9435/**
9436 * InsertThousandsGrouping:
9437 * @writer: Unicode writer.
9438 * @n_buffer: Number of characters in @buffer.
9439 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9440 * @d_pos: Start of digits string.
9441 * @n_digits: The number of digits in the string, in which we want
9442 * to put the grouping chars.
9443 * @min_width: The minimum width of the digits in the output string.
9444 * Output will be zero-padded on the left to fill.
9445 * @grouping: see definition in localeconv().
9446 * @thousands_sep: see definition in localeconv().
9447 *
9448 * There are 2 modes: counting and filling. If @writer is NULL,
9449 * we are in counting mode, else filling mode.
9450 * If counting, the required buffer size is returned.
9451 * If filling, we know the buffer will be large enough, so we don't
9452 * need to pass in the buffer size.
9453 * Inserts thousand grouping characters (as defined by grouping and
9454 * thousands_sep) into @writer.
9455 *
9456 * Return value: -1 on error, number of characters otherwise.
9457 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009458Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009459_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009460 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009461 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009462 PyObject *digits,
9463 Py_ssize_t d_pos,
9464 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009465 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009466 const char *grouping,
9467 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009468 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009469{
Xtreak3f7983a2019-01-07 20:39:14 +05309470 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009471 if (writer) {
9472 assert(digits != NULL);
9473 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009474 }
9475 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009476 assert(digits == NULL);
9477 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009478 }
Victor Stinner59423e32018-11-26 13:40:01 +01009479 assert(0 <= d_pos);
9480 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009481 assert(grouping != NULL);
9482
9483 if (digits != NULL) {
9484 if (PyUnicode_READY(digits) == -1) {
9485 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009486 }
Victor Stinner59423e32018-11-26 13:40:01 +01009487 }
9488 if (PyUnicode_READY(thousands_sep) == -1) {
9489 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009490 }
9491
Victor Stinner59423e32018-11-26 13:40:01 +01009492 Py_ssize_t count = 0;
9493 Py_ssize_t n_zeros;
9494 int loop_broken = 0;
9495 int use_separator = 0; /* First time through, don't append the
9496 separator. They only go between
9497 groups. */
9498 Py_ssize_t buffer_pos;
9499 Py_ssize_t digits_pos;
9500 Py_ssize_t len;
9501 Py_ssize_t n_chars;
9502 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9503 be looked at */
9504 /* A generator that returns all of the grouping widths, until it
9505 returns 0. */
9506 GroupGenerator groupgen;
9507 GroupGenerator_init(&groupgen, grouping);
9508 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9509
9510 /* if digits are not grouped, thousands separator
9511 should be an empty string */
9512 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9513
9514 digits_pos = d_pos + n_digits;
9515 if (writer) {
9516 buffer_pos = writer->pos + n_buffer;
9517 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9518 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 }
Victor Stinner59423e32018-11-26 13:40:01 +01009520 else {
9521 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009522 }
Victor Stinner59423e32018-11-26 13:40:01 +01009523
9524 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009525 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009526 }
Victor Stinner59423e32018-11-26 13:40:01 +01009527
9528 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9529 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9530 n_zeros = Py_MAX(0, len - remaining);
9531 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9532
9533 /* Use n_zero zero's and n_chars chars */
9534
9535 /* Count only, don't do anything. */
9536 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9537
9538 /* Copy into the writer. */
9539 InsertThousandsGrouping_fill(writer, &buffer_pos,
9540 digits, &digits_pos,
9541 n_chars, n_zeros,
9542 use_separator ? thousands_sep : NULL,
9543 thousands_sep_len, maxchar);
9544
9545 /* Use a separator next time. */
9546 use_separator = 1;
9547
9548 remaining -= n_chars;
9549 min_width -= len;
9550
9551 if (remaining <= 0 && min_width <= 0) {
9552 loop_broken = 1;
9553 break;
9554 }
9555 min_width -= thousands_sep_len;
9556 }
9557 if (!loop_broken) {
9558 /* We left the loop without using a break statement. */
9559
9560 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9561 n_zeros = Py_MAX(0, len - remaining);
9562 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9563
9564 /* Use n_zero zero's and n_chars chars */
9565 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9566
9567 /* Copy into the writer. */
9568 InsertThousandsGrouping_fill(writer, &buffer_pos,
9569 digits, &digits_pos,
9570 n_chars, n_zeros,
9571 use_separator ? thousands_sep : NULL,
9572 thousands_sep_len, maxchar);
9573 }
9574 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009575}
9576
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578Py_ssize_t
9579PyUnicode_Count(PyObject *str,
9580 PyObject *substr,
9581 Py_ssize_t start,
9582 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009584 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009585 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 void *buf1 = NULL, *buf2 = NULL;
9587 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009588
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009589 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009590 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009591
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009592 kind1 = PyUnicode_KIND(str);
9593 kind2 = PyUnicode_KIND(substr);
9594 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009595 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009596
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009597 len1 = PyUnicode_GET_LENGTH(str);
9598 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009599 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009600 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009601 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009602
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009603 buf1 = PyUnicode_DATA(str);
9604 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009605 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009606 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009607 if (!buf2)
9608 goto onError;
9609 }
9610
9611 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009613 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009614 result = asciilib_count(
9615 ((Py_UCS1*)buf1) + start, end - start,
9616 buf2, len2, PY_SSIZE_T_MAX
9617 );
9618 else
9619 result = ucs1lib_count(
9620 ((Py_UCS1*)buf1) + start, end - start,
9621 buf2, len2, PY_SSIZE_T_MAX
9622 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 break;
9624 case PyUnicode_2BYTE_KIND:
9625 result = ucs2lib_count(
9626 ((Py_UCS2*)buf1) + start, end - start,
9627 buf2, len2, PY_SSIZE_T_MAX
9628 );
9629 break;
9630 case PyUnicode_4BYTE_KIND:
9631 result = ucs4lib_count(
9632 ((Py_UCS4*)buf1) + start, end - start,
9633 buf2, len2, PY_SSIZE_T_MAX
9634 );
9635 break;
9636 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009637 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009638 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009639
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009640 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009641 PyMem_Free(buf2);
9642
Guido van Rossumd57fd912000-03-10 22:53:23 +00009643 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009644 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009645 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009646 PyMem_Free(buf2);
9647 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009648}
9649
Alexander Belopolsky40018472011-02-26 01:02:56 +00009650Py_ssize_t
9651PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009652 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009653 Py_ssize_t start,
9654 Py_ssize_t end,
9655 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009656{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009657 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009658 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009659
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009660 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661}
9662
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009663Py_ssize_t
9664PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9665 Py_ssize_t start, Py_ssize_t end,
9666 int direction)
9667{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009669 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009670 if (PyUnicode_READY(str) == -1)
9671 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009672 len = PyUnicode_GET_LENGTH(str);
9673 ADJUST_INDICES(start, end, len);
9674 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009675 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009676 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009677 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9678 kind, end-start, ch, direction);
9679 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009681 else
9682 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683}
9684
Alexander Belopolsky40018472011-02-26 01:02:56 +00009685static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009686tailmatch(PyObject *self,
9687 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009688 Py_ssize_t start,
9689 Py_ssize_t end,
9690 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 int kind_self;
9693 int kind_sub;
9694 void *data_self;
9695 void *data_sub;
9696 Py_ssize_t offset;
9697 Py_ssize_t i;
9698 Py_ssize_t end_sub;
9699
9700 if (PyUnicode_READY(self) == -1 ||
9701 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009702 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009704 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9705 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009706 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009707 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009708
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009709 if (PyUnicode_GET_LENGTH(substring) == 0)
9710 return 1;
9711
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009712 kind_self = PyUnicode_KIND(self);
9713 data_self = PyUnicode_DATA(self);
9714 kind_sub = PyUnicode_KIND(substring);
9715 data_sub = PyUnicode_DATA(substring);
9716 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9717
9718 if (direction > 0)
9719 offset = end;
9720 else
9721 offset = start;
9722
9723 if (PyUnicode_READ(kind_self, data_self, offset) ==
9724 PyUnicode_READ(kind_sub, data_sub, 0) &&
9725 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9726 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9727 /* If both are of the same kind, memcmp is sufficient */
9728 if (kind_self == kind_sub) {
9729 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009730 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009731 data_sub,
9732 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009733 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009734 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009735 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009736 else {
9737 /* We do not need to compare 0 and len(substring)-1 because
9738 the if statement above ensured already that they are equal
9739 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 for (i = 1; i < end_sub; ++i) {
9741 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9742 PyUnicode_READ(kind_sub, data_sub, i))
9743 return 0;
9744 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009745 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009747 }
9748
9749 return 0;
9750}
9751
Alexander Belopolsky40018472011-02-26 01:02:56 +00009752Py_ssize_t
9753PyUnicode_Tailmatch(PyObject *str,
9754 PyObject *substr,
9755 Py_ssize_t start,
9756 Py_ssize_t end,
9757 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009759 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009760 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009761
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009762 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009763}
9764
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009765static PyObject *
9766ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9769 char *resdata, *data = PyUnicode_DATA(self);
9770 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009771
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009772 res = PyUnicode_New(len, 127);
9773 if (res == NULL)
9774 return NULL;
9775 resdata = PyUnicode_DATA(res);
9776 if (lower)
9777 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009778 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009779 _Py_bytes_upper(resdata, data, len);
9780 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009781}
9782
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009784handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009786 Py_ssize_t j;
9787 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009788 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009789 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009790
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009791 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9792
9793 where ! is a negation and \p{xxx} is a character with property xxx.
9794 */
9795 for (j = i - 1; j >= 0; j--) {
9796 c = PyUnicode_READ(kind, data, j);
9797 if (!_PyUnicode_IsCaseIgnorable(c))
9798 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009799 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009800 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9801 if (final_sigma) {
9802 for (j = i + 1; j < length; j++) {
9803 c = PyUnicode_READ(kind, data, j);
9804 if (!_PyUnicode_IsCaseIgnorable(c))
9805 break;
9806 }
9807 final_sigma = j == length || !_PyUnicode_IsCased(c);
9808 }
9809 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009810}
9811
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009812static int
9813lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9814 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009815{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 /* Obscure special case. */
9817 if (c == 0x3A3) {
9818 mapped[0] = handle_capital_sigma(kind, data, length, i);
9819 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009820 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009821 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822}
9823
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009824static Py_ssize_t
9825do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009827 Py_ssize_t i, k = 0;
9828 int n_res, j;
9829 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009830
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009831 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009832 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009833 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009834 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009835 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 for (i = 1; i < length; i++) {
9838 c = PyUnicode_READ(kind, data, i);
9839 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9840 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009841 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009842 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009843 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009844 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009845 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009846}
9847
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009848static Py_ssize_t
9849do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9850 Py_ssize_t i, k = 0;
9851
9852 for (i = 0; i < length; i++) {
9853 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9854 int n_res, j;
9855 if (Py_UNICODE_ISUPPER(c)) {
9856 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9857 }
9858 else if (Py_UNICODE_ISLOWER(c)) {
9859 n_res = _PyUnicode_ToUpperFull(c, mapped);
9860 }
9861 else {
9862 n_res = 1;
9863 mapped[0] = c;
9864 }
9865 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009866 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009867 res[k++] = mapped[j];
9868 }
9869 }
9870 return k;
9871}
9872
9873static Py_ssize_t
9874do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9875 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009876{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009877 Py_ssize_t i, k = 0;
9878
9879 for (i = 0; i < length; i++) {
9880 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9881 int n_res, j;
9882 if (lower)
9883 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9884 else
9885 n_res = _PyUnicode_ToUpperFull(c, mapped);
9886 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009887 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009888 res[k++] = mapped[j];
9889 }
9890 }
9891 return k;
9892}
9893
9894static Py_ssize_t
9895do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9896{
9897 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9898}
9899
9900static Py_ssize_t
9901do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9902{
9903 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9904}
9905
Benjamin Petersone51757f2012-01-12 21:10:29 -05009906static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009907do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9908{
9909 Py_ssize_t i, k = 0;
9910
9911 for (i = 0; i < length; i++) {
9912 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9913 Py_UCS4 mapped[3];
9914 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9915 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009916 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009917 res[k++] = mapped[j];
9918 }
9919 }
9920 return k;
9921}
9922
9923static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009924do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9925{
9926 Py_ssize_t i, k = 0;
9927 int previous_is_cased;
9928
9929 previous_is_cased = 0;
9930 for (i = 0; i < length; i++) {
9931 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9932 Py_UCS4 mapped[3];
9933 int n_res, j;
9934
9935 if (previous_is_cased)
9936 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9937 else
9938 n_res = _PyUnicode_ToTitleFull(c, mapped);
9939
9940 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009941 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009942 res[k++] = mapped[j];
9943 }
9944
9945 previous_is_cased = _PyUnicode_IsCased(c);
9946 }
9947 return k;
9948}
9949
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009950static PyObject *
9951case_operation(PyObject *self,
9952 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9953{
9954 PyObject *res = NULL;
9955 Py_ssize_t length, newlength = 0;
9956 int kind, outkind;
9957 void *data, *outdata;
9958 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9959
Benjamin Petersoneea48462012-01-16 14:28:50 -05009960 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009961
9962 kind = PyUnicode_KIND(self);
9963 data = PyUnicode_DATA(self);
9964 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009965 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009966 PyErr_SetString(PyExc_OverflowError, "string is too long");
9967 return NULL;
9968 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009969 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009970 if (tmp == NULL)
9971 return PyErr_NoMemory();
9972 newlength = perform(kind, data, length, tmp, &maxchar);
9973 res = PyUnicode_New(newlength, maxchar);
9974 if (res == NULL)
9975 goto leave;
9976 tmpend = tmp + newlength;
9977 outdata = PyUnicode_DATA(res);
9978 outkind = PyUnicode_KIND(res);
9979 switch (outkind) {
9980 case PyUnicode_1BYTE_KIND:
9981 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9982 break;
9983 case PyUnicode_2BYTE_KIND:
9984 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9985 break;
9986 case PyUnicode_4BYTE_KIND:
9987 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9988 break;
9989 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009990 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009991 }
9992 leave:
9993 PyMem_FREE(tmp);
9994 return res;
9995}
9996
Tim Peters8ce9f162004-08-27 01:49:32 +00009997PyObject *
9998PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009999{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010000 PyObject *res;
10001 PyObject *fseq;
10002 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010003 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010004
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010005 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010006 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010007 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010008 }
10009
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010010 /* NOTE: the following code can't call back into Python code,
10011 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010012 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010014 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010015 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010016 res = _PyUnicode_JoinArray(separator, items, seqlen);
10017 Py_DECREF(fseq);
10018 return res;
10019}
10020
10021PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010022_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010023{
10024 PyObject *res = NULL; /* the result */
10025 PyObject *sep = NULL;
10026 Py_ssize_t seplen;
10027 PyObject *item;
10028 Py_ssize_t sz, i, res_offset;
10029 Py_UCS4 maxchar;
10030 Py_UCS4 item_maxchar;
10031 int use_memcpy;
10032 unsigned char *res_data = NULL, *sep_data = NULL;
10033 PyObject *last_obj;
10034 unsigned int kind = 0;
10035
Tim Peters05eba1f2004-08-27 21:32:02 +000010036 /* If empty sequence, return u"". */
10037 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010038 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010039 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010040
Tim Peters05eba1f2004-08-27 21:32:02 +000010041 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010042 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010043 if (seqlen == 1) {
10044 if (PyUnicode_CheckExact(items[0])) {
10045 res = items[0];
10046 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010047 return res;
10048 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010049 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010050 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010051 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010052 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010053 /* Set up sep and seplen */
10054 if (separator == NULL) {
10055 /* fall back to a blank space separator */
10056 sep = PyUnicode_FromOrdinal(' ');
10057 if (!sep)
10058 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010059 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010060 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010061 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010062 else {
10063 if (!PyUnicode_Check(separator)) {
10064 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010065 "separator: expected str instance,"
10066 " %.80s found",
10067 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010068 goto onError;
10069 }
10070 if (PyUnicode_READY(separator))
10071 goto onError;
10072 sep = separator;
10073 seplen = PyUnicode_GET_LENGTH(separator);
10074 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10075 /* inc refcount to keep this code path symmetric with the
10076 above case of a blank separator */
10077 Py_INCREF(sep);
10078 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010079 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010080 }
10081
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010082 /* There are at least two things to join, or else we have a subclass
10083 * of str in the sequence.
10084 * Do a pre-pass to figure out the total amount of space we'll
10085 * need (sz), and see whether all argument are strings.
10086 */
10087 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010088#ifdef Py_DEBUG
10089 use_memcpy = 0;
10090#else
10091 use_memcpy = 1;
10092#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010093 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010094 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010095 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010096 if (!PyUnicode_Check(item)) {
10097 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010098 "sequence item %zd: expected str instance,"
10099 " %.80s found",
10100 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 goto onError;
10102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 if (PyUnicode_READY(item) == -1)
10104 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010105 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010107 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010108 if (i != 0) {
10109 add_sz += seplen;
10110 }
10111 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010112 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010113 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010114 goto onError;
10115 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010116 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010117 if (use_memcpy && last_obj != NULL) {
10118 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10119 use_memcpy = 0;
10120 }
10121 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010122 }
Tim Petersced69f82003-09-16 20:30:58 +000010123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010124 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010125 if (res == NULL)
10126 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010127
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010128 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010129#ifdef Py_DEBUG
10130 use_memcpy = 0;
10131#else
10132 if (use_memcpy) {
10133 res_data = PyUnicode_1BYTE_DATA(res);
10134 kind = PyUnicode_KIND(res);
10135 if (seplen != 0)
10136 sep_data = PyUnicode_1BYTE_DATA(sep);
10137 }
10138#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010139 if (use_memcpy) {
10140 for (i = 0; i < seqlen; ++i) {
10141 Py_ssize_t itemlen;
10142 item = items[i];
10143
10144 /* Copy item, and maybe the separator. */
10145 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010146 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010147 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010148 kind * seplen);
10149 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010150 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010151
10152 itemlen = PyUnicode_GET_LENGTH(item);
10153 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010154 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010155 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010156 kind * itemlen);
10157 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010158 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010159 }
10160 assert(res_data == PyUnicode_1BYTE_DATA(res)
10161 + kind * PyUnicode_GET_LENGTH(res));
10162 }
10163 else {
10164 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10165 Py_ssize_t itemlen;
10166 item = items[i];
10167
10168 /* Copy item, and maybe the separator. */
10169 if (i && seplen != 0) {
10170 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10171 res_offset += seplen;
10172 }
10173
10174 itemlen = PyUnicode_GET_LENGTH(item);
10175 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010176 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010177 res_offset += itemlen;
10178 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010179 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010180 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010181 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010184 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010185 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010186
Benjamin Peterson29060642009-01-31 22:14:21 +000010187 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010189 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190 return NULL;
10191}
10192
Victor Stinnerd3f08822012-05-29 12:57:52 +020010193void
10194_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10195 Py_UCS4 fill_char)
10196{
10197 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010198 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010199 assert(PyUnicode_IS_READY(unicode));
10200 assert(unicode_modifiable(unicode));
10201 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10202 assert(start >= 0);
10203 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010204 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010205}
10206
Victor Stinner3fe55312012-01-04 00:33:50 +010010207Py_ssize_t
10208PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10209 Py_UCS4 fill_char)
10210{
10211 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010212
10213 if (!PyUnicode_Check(unicode)) {
10214 PyErr_BadInternalCall();
10215 return -1;
10216 }
10217 if (PyUnicode_READY(unicode) == -1)
10218 return -1;
10219 if (unicode_check_modifiable(unicode))
10220 return -1;
10221
Victor Stinnerd3f08822012-05-29 12:57:52 +020010222 if (start < 0) {
10223 PyErr_SetString(PyExc_IndexError, "string index out of range");
10224 return -1;
10225 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010226 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10227 PyErr_SetString(PyExc_ValueError,
10228 "fill character is bigger than "
10229 "the string maximum character");
10230 return -1;
10231 }
10232
10233 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10234 length = Py_MIN(maxlen, length);
10235 if (length <= 0)
10236 return 0;
10237
Victor Stinnerd3f08822012-05-29 12:57:52 +020010238 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010239 return length;
10240}
10241
Victor Stinner9310abb2011-10-05 00:59:23 +020010242static PyObject *
10243pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010244 Py_ssize_t left,
10245 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 PyObject *u;
10249 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010250 int kind;
10251 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252
10253 if (left < 0)
10254 left = 0;
10255 if (right < 0)
10256 right = 0;
10257
Victor Stinnerc4b49542011-12-11 22:44:26 +010010258 if (left == 0 && right == 0)
10259 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010260
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10262 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010263 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10264 return NULL;
10265 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010266 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010267 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010269 if (!u)
10270 return NULL;
10271
10272 kind = PyUnicode_KIND(u);
10273 data = PyUnicode_DATA(u);
10274 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010275 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010276 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010277 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010278 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010279 assert(_PyUnicode_CheckConsistency(u, 1));
10280 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010281}
10282
Alexander Belopolsky40018472011-02-26 01:02:56 +000010283PyObject *
10284PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010286 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010287
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010288 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010289 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290
Benjamin Petersonead6b532011-12-20 17:23:42 -060010291 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010292 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010293 if (PyUnicode_IS_ASCII(string))
10294 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010295 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010296 PyUnicode_GET_LENGTH(string), keepends);
10297 else
10298 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010300 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301 break;
10302 case PyUnicode_2BYTE_KIND:
10303 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010304 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 PyUnicode_GET_LENGTH(string), keepends);
10306 break;
10307 case PyUnicode_4BYTE_KIND:
10308 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 PyUnicode_GET_LENGTH(string), keepends);
10311 break;
10312 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010313 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010315 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010316}
10317
Alexander Belopolsky40018472011-02-26 01:02:56 +000010318static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010319split(PyObject *self,
10320 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010321 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010323 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 void *buf1, *buf2;
10325 Py_ssize_t len1, len2;
10326 PyObject* out;
10327
Guido van Rossumd57fd912000-03-10 22:53:23 +000010328 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010329 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010330
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010331 if (PyUnicode_READY(self) == -1)
10332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010333
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010334 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010335 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010337 if (PyUnicode_IS_ASCII(self))
10338 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010339 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010340 PyUnicode_GET_LENGTH(self), maxcount
10341 );
10342 else
10343 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010345 PyUnicode_GET_LENGTH(self), maxcount
10346 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 case PyUnicode_2BYTE_KIND:
10348 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 PyUnicode_GET_LENGTH(self), maxcount
10351 );
10352 case PyUnicode_4BYTE_KIND:
10353 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 PyUnicode_GET_LENGTH(self), maxcount
10356 );
10357 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010358 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 }
10360
10361 if (PyUnicode_READY(substring) == -1)
10362 return NULL;
10363
10364 kind1 = PyUnicode_KIND(self);
10365 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 len1 = PyUnicode_GET_LENGTH(self);
10367 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010368 if (kind1 < kind2 || len1 < len2) {
10369 out = PyList_New(1);
10370 if (out == NULL)
10371 return NULL;
10372 Py_INCREF(self);
10373 PyList_SET_ITEM(out, 0, self);
10374 return out;
10375 }
10376 buf1 = PyUnicode_DATA(self);
10377 buf2 = PyUnicode_DATA(substring);
10378 if (kind2 != kind1) {
10379 buf2 = _PyUnicode_AsKind(substring, kind1);
10380 if (!buf2)
10381 return NULL;
10382 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010383
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010384 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010386 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10387 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010388 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010389 else
10390 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010391 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 break;
10393 case PyUnicode_2BYTE_KIND:
10394 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 break;
10397 case PyUnicode_4BYTE_KIND:
10398 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 break;
10401 default:
10402 out = NULL;
10403 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010404 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 PyMem_Free(buf2);
10406 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010407}
10408
Alexander Belopolsky40018472011-02-26 01:02:56 +000010409static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010410rsplit(PyObject *self,
10411 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010412 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010413{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010414 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 void *buf1, *buf2;
10416 Py_ssize_t len1, len2;
10417 PyObject* out;
10418
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010419 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010420 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 if (PyUnicode_READY(self) == -1)
10423 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010424
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010426 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010428 if (PyUnicode_IS_ASCII(self))
10429 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010430 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010431 PyUnicode_GET_LENGTH(self), maxcount
10432 );
10433 else
10434 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010435 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010436 PyUnicode_GET_LENGTH(self), maxcount
10437 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 case PyUnicode_2BYTE_KIND:
10439 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010440 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441 PyUnicode_GET_LENGTH(self), maxcount
10442 );
10443 case PyUnicode_4BYTE_KIND:
10444 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010445 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010446 PyUnicode_GET_LENGTH(self), maxcount
10447 );
10448 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010449 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 }
10451
10452 if (PyUnicode_READY(substring) == -1)
10453 return NULL;
10454
10455 kind1 = PyUnicode_KIND(self);
10456 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010457 len1 = PyUnicode_GET_LENGTH(self);
10458 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010459 if (kind1 < kind2 || len1 < len2) {
10460 out = PyList_New(1);
10461 if (out == NULL)
10462 return NULL;
10463 Py_INCREF(self);
10464 PyList_SET_ITEM(out, 0, self);
10465 return out;
10466 }
10467 buf1 = PyUnicode_DATA(self);
10468 buf2 = PyUnicode_DATA(substring);
10469 if (kind2 != kind1) {
10470 buf2 = _PyUnicode_AsKind(substring, kind1);
10471 if (!buf2)
10472 return NULL;
10473 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010474
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010475 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010477 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10478 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010479 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010480 else
10481 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010482 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 break;
10484 case PyUnicode_2BYTE_KIND:
10485 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010486 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 break;
10488 case PyUnicode_4BYTE_KIND:
10489 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010490 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 break;
10492 default:
10493 out = NULL;
10494 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010495 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010496 PyMem_Free(buf2);
10497 return out;
10498}
10499
10500static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010501anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10502 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010504 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010505 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010506 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10507 return asciilib_find(buf1, len1, buf2, len2, offset);
10508 else
10509 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010510 case PyUnicode_2BYTE_KIND:
10511 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10512 case PyUnicode_4BYTE_KIND:
10513 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10514 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010515 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516}
10517
10518static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010519anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10520 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010521{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010522 switch (kind) {
10523 case PyUnicode_1BYTE_KIND:
10524 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10525 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10526 else
10527 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10528 case PyUnicode_2BYTE_KIND:
10529 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10530 case PyUnicode_4BYTE_KIND:
10531 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10532 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010533 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010534}
10535
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010536static void
10537replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10538 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10539{
10540 int kind = PyUnicode_KIND(u);
10541 void *data = PyUnicode_DATA(u);
10542 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10543 if (kind == PyUnicode_1BYTE_KIND) {
10544 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10545 (Py_UCS1 *)data + len,
10546 u1, u2, maxcount);
10547 }
10548 else if (kind == PyUnicode_2BYTE_KIND) {
10549 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10550 (Py_UCS2 *)data + len,
10551 u1, u2, maxcount);
10552 }
10553 else {
10554 assert(kind == PyUnicode_4BYTE_KIND);
10555 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10556 (Py_UCS4 *)data + len,
10557 u1, u2, maxcount);
10558 }
10559}
10560
Alexander Belopolsky40018472011-02-26 01:02:56 +000010561static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562replace(PyObject *self, PyObject *str1,
10563 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010564{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010565 PyObject *u;
10566 char *sbuf = PyUnicode_DATA(self);
10567 char *buf1 = PyUnicode_DATA(str1);
10568 char *buf2 = PyUnicode_DATA(str2);
10569 int srelease = 0, release1 = 0, release2 = 0;
10570 int skind = PyUnicode_KIND(self);
10571 int kind1 = PyUnicode_KIND(str1);
10572 int kind2 = PyUnicode_KIND(str2);
10573 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10574 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10575 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010577 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010578
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010579 if (slen < len1)
10580 goto nothing;
10581
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010583 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010584 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010585 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586
Victor Stinner59de0ee2011-10-07 10:01:28 +020010587 if (str1 == str2)
10588 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589
Victor Stinner49a0a212011-10-12 23:46:10 +020010590 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010591 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10592 if (maxchar < maxchar_str1)
10593 /* substring too wide to be present */
10594 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010595 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10596 /* Replacing str1 with str2 may cause a maxchar reduction in the
10597 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010598 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010599 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010602 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010604 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010608 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010609
Victor Stinner69ed0f42013-04-09 21:48:24 +020010610 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010611 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010612 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010613 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010614 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010615 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010616 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010617 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010618
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010619 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10620 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 }
10622 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010623 int rkind = skind;
10624 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010625 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 if (kind1 < rkind) {
10628 /* widen substring */
10629 buf1 = _PyUnicode_AsKind(str1, rkind);
10630 if (!buf1) goto error;
10631 release1 = 1;
10632 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010633 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010634 if (i < 0)
10635 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010636 if (rkind > kind2) {
10637 /* widen replacement */
10638 buf2 = _PyUnicode_AsKind(str2, rkind);
10639 if (!buf2) goto error;
10640 release2 = 1;
10641 }
10642 else if (rkind < kind2) {
10643 /* widen self and buf1 */
10644 rkind = kind2;
10645 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010646 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010647 sbuf = _PyUnicode_AsKind(self, rkind);
10648 if (!sbuf) goto error;
10649 srelease = 1;
10650 buf1 = _PyUnicode_AsKind(str1, rkind);
10651 if (!buf1) goto error;
10652 release1 = 1;
10653 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010654 u = PyUnicode_New(slen, maxchar);
10655 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010656 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010657 assert(PyUnicode_KIND(u) == rkind);
10658 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010659
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010661 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010665 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010666
10667 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010668 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010669 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010670 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010671 if (i == -1)
10672 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010677 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010678 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010679 }
10680 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010681 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010682 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010683 int rkind = skind;
10684 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010685
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 buf1 = _PyUnicode_AsKind(str1, rkind);
10689 if (!buf1) goto error;
10690 release1 = 1;
10691 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010692 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 if (n == 0)
10694 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010695 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010696 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010697 buf2 = _PyUnicode_AsKind(str2, rkind);
10698 if (!buf2) goto error;
10699 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010700 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010702 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 rkind = kind2;
10704 sbuf = _PyUnicode_AsKind(self, rkind);
10705 if (!sbuf) goto error;
10706 srelease = 1;
10707 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010708 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010709 buf1 = _PyUnicode_AsKind(str1, rkind);
10710 if (!buf1) goto error;
10711 release1 = 1;
10712 }
10713 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10714 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010715 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 PyErr_SetString(PyExc_OverflowError,
10717 "replace string is too long");
10718 goto error;
10719 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010720 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010721 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010722 _Py_INCREF_UNICODE_EMPTY();
10723 if (!unicode_empty)
10724 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010725 u = unicode_empty;
10726 goto done;
10727 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010728 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 PyErr_SetString(PyExc_OverflowError,
10730 "replace string is too long");
10731 goto error;
10732 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010733 u = PyUnicode_New(new_size, maxchar);
10734 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010736 assert(PyUnicode_KIND(u) == rkind);
10737 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 ires = i = 0;
10739 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010740 while (n-- > 0) {
10741 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010742 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010743 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010744 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010745 if (j == -1)
10746 break;
10747 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010748 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010749 memcpy(res + rkind * ires,
10750 sbuf + rkind * i,
10751 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010752 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010753 }
10754 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010755 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010756 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010757 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010758 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010760 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010762 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010764 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010765 memcpy(res + rkind * ires,
10766 sbuf + rkind * i,
10767 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010768 }
10769 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010770 /* interleave */
10771 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010772 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010773 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010774 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010775 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010776 if (--n <= 0)
10777 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010778 memcpy(res + rkind * ires,
10779 sbuf + rkind * i,
10780 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 ires++;
10782 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010783 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010784 memcpy(res + rkind * ires,
10785 sbuf + rkind * i,
10786 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010787 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010788 }
10789
10790 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010791 unicode_adjust_maxchar(&u);
10792 if (u == NULL)
10793 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010794 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010795
10796 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010797 if (srelease)
10798 PyMem_FREE(sbuf);
10799 if (release1)
10800 PyMem_FREE(buf1);
10801 if (release2)
10802 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010803 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010805
Benjamin Peterson29060642009-01-31 22:14:21 +000010806 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010807 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 if (srelease)
10809 PyMem_FREE(sbuf);
10810 if (release1)
10811 PyMem_FREE(buf1);
10812 if (release2)
10813 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010814 return unicode_result_unchanged(self);
10815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 error:
10817 if (srelease && sbuf)
10818 PyMem_FREE(sbuf);
10819 if (release1 && buf1)
10820 PyMem_FREE(buf1);
10821 if (release2 && buf2)
10822 PyMem_FREE(buf2);
10823 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010824}
10825
10826/* --- Unicode Object Methods --------------------------------------------- */
10827
INADA Naoki3ae20562017-01-16 20:41:20 +090010828/*[clinic input]
10829str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
INADA Naoki3ae20562017-01-16 20:41:20 +090010831Return a version of the string where each word is titlecased.
10832
10833More specifically, words start with uppercased characters and all remaining
10834cased characters have lower case.
10835[clinic start generated code]*/
10836
10837static PyObject *
10838unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010839/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010841 if (PyUnicode_READY(self) == -1)
10842 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010843 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844}
10845
INADA Naoki3ae20562017-01-16 20:41:20 +090010846/*[clinic input]
10847str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848
INADA Naoki3ae20562017-01-16 20:41:20 +090010849Return a capitalized version of the string.
10850
10851More specifically, make the first character have upper case and the rest lower
10852case.
10853[clinic start generated code]*/
10854
10855static PyObject *
10856unicode_capitalize_impl(PyObject *self)
10857/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010859 if (PyUnicode_READY(self) == -1)
10860 return NULL;
10861 if (PyUnicode_GET_LENGTH(self) == 0)
10862 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010863 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010864}
10865
INADA Naoki3ae20562017-01-16 20:41:20 +090010866/*[clinic input]
10867str.casefold as unicode_casefold
10868
10869Return a version of the string suitable for caseless comparisons.
10870[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010871
10872static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010873unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010874/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010875{
10876 if (PyUnicode_READY(self) == -1)
10877 return NULL;
10878 if (PyUnicode_IS_ASCII(self))
10879 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010880 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010881}
10882
10883
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010884/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010885
10886static int
10887convert_uc(PyObject *obj, void *addr)
10888{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010889 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010890
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010891 if (!PyUnicode_Check(obj)) {
10892 PyErr_Format(PyExc_TypeError,
10893 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010894 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010895 return 0;
10896 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010897 if (PyUnicode_READY(obj) < 0)
10898 return 0;
10899 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010900 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010901 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010902 return 0;
10903 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010904 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010905 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010906}
10907
INADA Naoki3ae20562017-01-16 20:41:20 +090010908/*[clinic input]
10909str.center as unicode_center
10910
10911 width: Py_ssize_t
10912 fillchar: Py_UCS4 = ' '
10913 /
10914
10915Return a centered string of length width.
10916
10917Padding is done using the specified fill character (default is a space).
10918[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010919
10920static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010921unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10922/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010924 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010925
Benjamin Petersonbac79492012-01-14 13:34:47 -050010926 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927 return NULL;
10928
Victor Stinnerc4b49542011-12-11 22:44:26 +010010929 if (PyUnicode_GET_LENGTH(self) >= width)
10930 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931
Victor Stinnerc4b49542011-12-11 22:44:26 +010010932 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010933 left = marg / 2 + (marg & width & 1);
10934
Victor Stinner9310abb2011-10-05 00:59:23 +020010935 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010936}
10937
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010938/* This function assumes that str1 and str2 are readied by the caller. */
10939
Marc-André Lemburge5034372000-08-08 08:04:29 +000010940static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010941unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010942{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010943#define COMPARE(TYPE1, TYPE2) \
10944 do { \
10945 TYPE1* p1 = (TYPE1 *)data1; \
10946 TYPE2* p2 = (TYPE2 *)data2; \
10947 TYPE1* end = p1 + len; \
10948 Py_UCS4 c1, c2; \
10949 for (; p1 != end; p1++, p2++) { \
10950 c1 = *p1; \
10951 c2 = *p2; \
10952 if (c1 != c2) \
10953 return (c1 < c2) ? -1 : 1; \
10954 } \
10955 } \
10956 while (0)
10957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010958 int kind1, kind2;
10959 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010960 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 kind1 = PyUnicode_KIND(str1);
10963 kind2 = PyUnicode_KIND(str2);
10964 data1 = PyUnicode_DATA(str1);
10965 data2 = PyUnicode_DATA(str2);
10966 len1 = PyUnicode_GET_LENGTH(str1);
10967 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010968 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010969
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010970 switch(kind1) {
10971 case PyUnicode_1BYTE_KIND:
10972 {
10973 switch(kind2) {
10974 case PyUnicode_1BYTE_KIND:
10975 {
10976 int cmp = memcmp(data1, data2, len);
10977 /* normalize result of memcmp() into the range [-1; 1] */
10978 if (cmp < 0)
10979 return -1;
10980 if (cmp > 0)
10981 return 1;
10982 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010983 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010984 case PyUnicode_2BYTE_KIND:
10985 COMPARE(Py_UCS1, Py_UCS2);
10986 break;
10987 case PyUnicode_4BYTE_KIND:
10988 COMPARE(Py_UCS1, Py_UCS4);
10989 break;
10990 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010991 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010992 }
10993 break;
10994 }
10995 case PyUnicode_2BYTE_KIND:
10996 {
10997 switch(kind2) {
10998 case PyUnicode_1BYTE_KIND:
10999 COMPARE(Py_UCS2, Py_UCS1);
11000 break;
11001 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011002 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011003 COMPARE(Py_UCS2, Py_UCS2);
11004 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011005 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011006 case PyUnicode_4BYTE_KIND:
11007 COMPARE(Py_UCS2, Py_UCS4);
11008 break;
11009 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011010 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011011 }
11012 break;
11013 }
11014 case PyUnicode_4BYTE_KIND:
11015 {
11016 switch(kind2) {
11017 case PyUnicode_1BYTE_KIND:
11018 COMPARE(Py_UCS4, Py_UCS1);
11019 break;
11020 case PyUnicode_2BYTE_KIND:
11021 COMPARE(Py_UCS4, Py_UCS2);
11022 break;
11023 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011024 {
11025#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11026 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11027 /* normalize result of wmemcmp() into the range [-1; 1] */
11028 if (cmp < 0)
11029 return -1;
11030 if (cmp > 0)
11031 return 1;
11032#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011033 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011034#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011035 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011036 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011037 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011038 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011039 }
11040 break;
11041 }
11042 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011043 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011044 }
11045
Victor Stinner770e19e2012-10-04 22:59:45 +020011046 if (len1 == len2)
11047 return 0;
11048 if (len1 < len2)
11049 return -1;
11050 else
11051 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011052
11053#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011054}
11055
Benjamin Peterson621b4302016-09-09 13:54:34 -070011056static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011057unicode_compare_eq(PyObject *str1, PyObject *str2)
11058{
11059 int kind;
11060 void *data1, *data2;
11061 Py_ssize_t len;
11062 int cmp;
11063
Victor Stinnere5567ad2012-10-23 02:48:49 +020011064 len = PyUnicode_GET_LENGTH(str1);
11065 if (PyUnicode_GET_LENGTH(str2) != len)
11066 return 0;
11067 kind = PyUnicode_KIND(str1);
11068 if (PyUnicode_KIND(str2) != kind)
11069 return 0;
11070 data1 = PyUnicode_DATA(str1);
11071 data2 = PyUnicode_DATA(str2);
11072
11073 cmp = memcmp(data1, data2, len * kind);
11074 return (cmp == 0);
11075}
11076
11077
Alexander Belopolsky40018472011-02-26 01:02:56 +000011078int
11079PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011080{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011081 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11082 if (PyUnicode_READY(left) == -1 ||
11083 PyUnicode_READY(right) == -1)
11084 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011085
11086 /* a string is equal to itself */
11087 if (left == right)
11088 return 0;
11089
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011090 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011091 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011092 PyErr_Format(PyExc_TypeError,
11093 "Can't compare %.100s and %.100s",
11094 left->ob_type->tp_name,
11095 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011096 return -1;
11097}
11098
Martin v. Löwis5b222132007-06-10 09:51:05 +000011099int
11100PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011102 Py_ssize_t i;
11103 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011104 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011105 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106
Victor Stinner910337b2011-10-03 03:20:16 +020011107 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011108 if (!PyUnicode_IS_READY(uni)) {
11109 const wchar_t *ws = _PyUnicode_WSTR(uni);
11110 /* Compare Unicode string and source character set string */
11111 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11112 if (chr != ustr[i])
11113 return (chr < ustr[i]) ? -1 : 1;
11114 }
11115 /* This check keeps Python strings that end in '\0' from comparing equal
11116 to C strings identical up to that point. */
11117 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11118 return 1; /* uni is longer */
11119 if (ustr[i])
11120 return -1; /* str is longer */
11121 return 0;
11122 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011123 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011124 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011125 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011126 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011127 size_t len, len2 = strlen(str);
11128 int cmp;
11129
11130 len = Py_MIN(len1, len2);
11131 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011132 if (cmp != 0) {
11133 if (cmp < 0)
11134 return -1;
11135 else
11136 return 1;
11137 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011138 if (len1 > len2)
11139 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011140 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011141 return -1; /* str is longer */
11142 return 0;
11143 }
11144 else {
11145 void *data = PyUnicode_DATA(uni);
11146 /* Compare Unicode string and source character set string */
11147 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011148 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011149 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11150 /* This check keeps Python strings that end in '\0' from comparing equal
11151 to C strings identical up to that point. */
11152 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11153 return 1; /* uni is longer */
11154 if (str[i])
11155 return -1; /* str is longer */
11156 return 0;
11157 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011158}
11159
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011160static int
11161non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11162{
11163 size_t i, len;
11164 const wchar_t *p;
11165 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11166 if (strlen(str) != len)
11167 return 0;
11168 p = _PyUnicode_WSTR(unicode);
11169 assert(p);
11170 for (i = 0; i < len; i++) {
11171 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011172 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011173 return 0;
11174 }
11175 return 1;
11176}
11177
11178int
11179_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11180{
11181 size_t len;
11182 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011183 assert(str);
11184#ifndef NDEBUG
11185 for (const char *p = str; *p; p++) {
11186 assert((unsigned char)*p < 128);
11187 }
11188#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011189 if (PyUnicode_READY(unicode) == -1) {
11190 /* Memory error or bad data */
11191 PyErr_Clear();
11192 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11193 }
11194 if (!PyUnicode_IS_ASCII(unicode))
11195 return 0;
11196 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11197 return strlen(str) == len &&
11198 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11199}
11200
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011201int
11202_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11203{
11204 PyObject *right_uni;
11205 Py_hash_t hash;
11206
11207 assert(_PyUnicode_CHECK(left));
11208 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011209#ifndef NDEBUG
11210 for (const char *p = right->string; *p; p++) {
11211 assert((unsigned char)*p < 128);
11212 }
11213#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011214
11215 if (PyUnicode_READY(left) == -1) {
11216 /* memory error or bad data */
11217 PyErr_Clear();
11218 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11219 }
11220
11221 if (!PyUnicode_IS_ASCII(left))
11222 return 0;
11223
11224 right_uni = _PyUnicode_FromId(right); /* borrowed */
11225 if (right_uni == NULL) {
11226 /* memory error or bad data */
11227 PyErr_Clear();
11228 return _PyUnicode_EqualToASCIIString(left, right->string);
11229 }
11230
11231 if (left == right_uni)
11232 return 1;
11233
11234 if (PyUnicode_CHECK_INTERNED(left))
11235 return 0;
11236
INADA Naoki7cc95f52018-01-28 02:07:09 +090011237 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011238 hash = _PyUnicode_HASH(left);
11239 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11240 return 0;
11241
11242 return unicode_compare_eq(left, right_uni);
11243}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011244
Alexander Belopolsky40018472011-02-26 01:02:56 +000011245PyObject *
11246PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011247{
11248 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011249
Victor Stinnere5567ad2012-10-23 02:48:49 +020011250 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11251 Py_RETURN_NOTIMPLEMENTED;
11252
11253 if (PyUnicode_READY(left) == -1 ||
11254 PyUnicode_READY(right) == -1)
11255 return NULL;
11256
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011257 if (left == right) {
11258 switch (op) {
11259 case Py_EQ:
11260 case Py_LE:
11261 case Py_GE:
11262 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011263 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011264 case Py_NE:
11265 case Py_LT:
11266 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011267 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011268 default:
11269 PyErr_BadArgument();
11270 return NULL;
11271 }
11272 }
11273 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011274 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011275 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011276 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011277 }
11278 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011279 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011280 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011281 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011282}
11283
Alexander Belopolsky40018472011-02-26 01:02:56 +000011284int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011285_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11286{
11287 return unicode_eq(aa, bb);
11288}
11289
11290int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011292{
Victor Stinner77282cb2013-04-14 19:22:47 +020011293 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011294 void *buf1, *buf2;
11295 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011296 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011297
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011298 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011299 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011300 "'in <string>' requires string as left operand, not %.100s",
11301 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011302 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011303 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011304 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011305 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011306 if (ensure_unicode(str) < 0)
11307 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011308
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011309 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011310 kind2 = PyUnicode_KIND(substr);
11311 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011312 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314 len2 = PyUnicode_GET_LENGTH(substr);
11315 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011316 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011317 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011319 if (len2 == 1) {
11320 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11321 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011322 return result;
11323 }
11324 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011325 buf2 = _PyUnicode_AsKind(substr, kind1);
11326 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011327 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011328 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011329
Victor Stinner77282cb2013-04-14 19:22:47 +020011330 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011331 case PyUnicode_1BYTE_KIND:
11332 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11333 break;
11334 case PyUnicode_2BYTE_KIND:
11335 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11336 break;
11337 case PyUnicode_4BYTE_KIND:
11338 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11339 break;
11340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011341 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011342 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011343
Victor Stinner77282cb2013-04-14 19:22:47 +020011344 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011345 PyMem_Free(buf2);
11346
Guido van Rossum403d68b2000-03-13 15:55:09 +000011347 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011348}
11349
Guido van Rossumd57fd912000-03-10 22:53:23 +000011350/* Concat to string or Unicode object giving a new Unicode object. */
11351
Alexander Belopolsky40018472011-02-26 01:02:56 +000011352PyObject *
11353PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011355 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011356 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011357 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011359 if (ensure_unicode(left) < 0)
11360 return NULL;
11361
11362 if (!PyUnicode_Check(right)) {
11363 PyErr_Format(PyExc_TypeError,
11364 "can only concatenate str (not \"%.200s\") to str",
11365 right->ob_type->tp_name);
11366 return NULL;
11367 }
11368 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011370
11371 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011372 if (left == unicode_empty)
11373 return PyUnicode_FromObject(right);
11374 if (right == unicode_empty)
11375 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011376
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011377 left_len = PyUnicode_GET_LENGTH(left);
11378 right_len = PyUnicode_GET_LENGTH(right);
11379 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011380 PyErr_SetString(PyExc_OverflowError,
11381 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011382 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011383 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011384 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011385
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011386 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11387 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011388 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011389
Guido van Rossumd57fd912000-03-10 22:53:23 +000011390 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011391 result = PyUnicode_New(new_len, maxchar);
11392 if (result == NULL)
11393 return NULL;
11394 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11395 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11396 assert(_PyUnicode_CheckConsistency(result, 1));
11397 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011398}
11399
Walter Dörwald1ab83302007-05-18 17:15:44 +000011400void
Victor Stinner23e56682011-10-03 03:54:37 +020011401PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011402{
Victor Stinner23e56682011-10-03 03:54:37 +020011403 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011404 Py_UCS4 maxchar, maxchar2;
11405 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011406
11407 if (p_left == NULL) {
11408 if (!PyErr_Occurred())
11409 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011410 return;
11411 }
Victor Stinner23e56682011-10-03 03:54:37 +020011412 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011413 if (right == NULL || left == NULL
11414 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011415 if (!PyErr_Occurred())
11416 PyErr_BadInternalCall();
11417 goto error;
11418 }
11419
Benjamin Petersonbac79492012-01-14 13:34:47 -050011420 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011421 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011422 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011423 goto error;
11424
Victor Stinner488fa492011-12-12 00:01:39 +010011425 /* Shortcuts */
11426 if (left == unicode_empty) {
11427 Py_DECREF(left);
11428 Py_INCREF(right);
11429 *p_left = right;
11430 return;
11431 }
11432 if (right == unicode_empty)
11433 return;
11434
11435 left_len = PyUnicode_GET_LENGTH(left);
11436 right_len = PyUnicode_GET_LENGTH(right);
11437 if (left_len > PY_SSIZE_T_MAX - right_len) {
11438 PyErr_SetString(PyExc_OverflowError,
11439 "strings are too large to concat");
11440 goto error;
11441 }
11442 new_len = left_len + right_len;
11443
11444 if (unicode_modifiable(left)
11445 && PyUnicode_CheckExact(right)
11446 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011447 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11448 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011449 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011450 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011451 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11452 {
11453 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011454 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011455 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011456
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011457 /* copy 'right' into the newly allocated area of 'left' */
11458 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011459 }
Victor Stinner488fa492011-12-12 00:01:39 +010011460 else {
11461 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11462 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011463 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011464
Victor Stinner488fa492011-12-12 00:01:39 +010011465 /* Concat the two Unicode strings */
11466 res = PyUnicode_New(new_len, maxchar);
11467 if (res == NULL)
11468 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011469 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11470 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011471 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011472 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011473 }
11474 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011475 return;
11476
11477error:
Victor Stinner488fa492011-12-12 00:01:39 +010011478 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011479}
11480
11481void
11482PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11483{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011484 PyUnicode_Append(pleft, right);
11485 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011486}
11487
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011488/*
11489Wraps stringlib_parse_args_finds() and additionally ensures that the
11490first argument is a unicode object.
11491*/
11492
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011493static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011494parse_args_finds_unicode(const char * function_name, PyObject *args,
11495 PyObject **substring,
11496 Py_ssize_t *start, Py_ssize_t *end)
11497{
11498 if(stringlib_parse_args_finds(function_name, args, substring,
11499 start, end)) {
11500 if (ensure_unicode(*substring) < 0)
11501 return 0;
11502 return 1;
11503 }
11504 return 0;
11505}
11506
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011507PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011508 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011509\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011510Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011511string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011512interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513
11514static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011515unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011517 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011518 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011519 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011521 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011522 void *buf1, *buf2;
11523 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011525 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011526 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011527
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011528 kind1 = PyUnicode_KIND(self);
11529 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011530 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011531 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011532
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011533 len1 = PyUnicode_GET_LENGTH(self);
11534 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011535 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011536 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011537 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011538
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011539 buf1 = PyUnicode_DATA(self);
11540 buf2 = PyUnicode_DATA(substring);
11541 if (kind2 != kind1) {
11542 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011543 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011544 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011545 }
11546 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011547 case PyUnicode_1BYTE_KIND:
11548 iresult = ucs1lib_count(
11549 ((Py_UCS1*)buf1) + start, end - start,
11550 buf2, len2, PY_SSIZE_T_MAX
11551 );
11552 break;
11553 case PyUnicode_2BYTE_KIND:
11554 iresult = ucs2lib_count(
11555 ((Py_UCS2*)buf1) + start, end - start,
11556 buf2, len2, PY_SSIZE_T_MAX
11557 );
11558 break;
11559 case PyUnicode_4BYTE_KIND:
11560 iresult = ucs4lib_count(
11561 ((Py_UCS4*)buf1) + start, end - start,
11562 buf2, len2, PY_SSIZE_T_MAX
11563 );
11564 break;
11565 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011566 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011567 }
11568
11569 result = PyLong_FromSsize_t(iresult);
11570
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011571 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011572 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 return result;
11575}
11576
INADA Naoki3ae20562017-01-16 20:41:20 +090011577/*[clinic input]
11578str.encode as unicode_encode
11579
11580 encoding: str(c_default="NULL") = 'utf-8'
11581 The encoding in which to encode the string.
11582 errors: str(c_default="NULL") = 'strict'
11583 The error handling scheme to use for encoding errors.
11584 The default is 'strict' meaning that encoding errors raise a
11585 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11586 'xmlcharrefreplace' as well as any other name registered with
11587 codecs.register_error that can handle UnicodeEncodeErrors.
11588
11589Encode the string using the codec registered for encoding.
11590[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011591
11592static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011593unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011594/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011596 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011597}
11598
INADA Naoki3ae20562017-01-16 20:41:20 +090011599/*[clinic input]
11600str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011601
INADA Naoki3ae20562017-01-16 20:41:20 +090011602 tabsize: int = 8
11603
11604Return a copy where all tab characters are expanded using spaces.
11605
11606If tabsize is not given, a tab size of 8 characters is assumed.
11607[clinic start generated code]*/
11608
11609static PyObject *
11610unicode_expandtabs_impl(PyObject *self, int tabsize)
11611/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011613 Py_ssize_t i, j, line_pos, src_len, incr;
11614 Py_UCS4 ch;
11615 PyObject *u;
11616 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011617 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011618 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
Antoine Pitrou22425222011-10-04 19:10:51 +020011620 if (PyUnicode_READY(self) == -1)
11621 return NULL;
11622
Thomas Wouters7e474022000-07-16 12:04:32 +000011623 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011624 src_len = PyUnicode_GET_LENGTH(self);
11625 i = j = line_pos = 0;
11626 kind = PyUnicode_KIND(self);
11627 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011628 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011629 for (; i < src_len; i++) {
11630 ch = PyUnicode_READ(kind, src_data, i);
11631 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011632 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011633 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011634 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011635 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011636 goto overflow;
11637 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011638 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011639 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011640 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011641 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011643 goto overflow;
11644 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011646 if (ch == '\n' || ch == '\r')
11647 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011648 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011649 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011650 if (!found)
11651 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011652
Guido van Rossumd57fd912000-03-10 22:53:23 +000011653 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011654 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655 if (!u)
11656 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011657 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658
Antoine Pitroue71d5742011-10-04 15:55:09 +020011659 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660
Antoine Pitroue71d5742011-10-04 15:55:09 +020011661 for (; i < src_len; i++) {
11662 ch = PyUnicode_READ(kind, src_data, i);
11663 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011664 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011665 incr = tabsize - (line_pos % tabsize);
11666 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011667 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011668 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011669 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011670 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011671 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011672 line_pos++;
11673 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011674 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011675 if (ch == '\n' || ch == '\r')
11676 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011678 }
11679 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011680 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011681
Antoine Pitroue71d5742011-10-04 15:55:09 +020011682 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011683 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11684 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685}
11686
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011687PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011688 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689\n\
11690Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011691such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692arguments start and end are interpreted as in slice notation.\n\
11693\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011694Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
11696static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011699 /* initialize variables to prevent gcc warning */
11700 PyObject *substring = NULL;
11701 Py_ssize_t start = 0;
11702 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011703 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011705 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011707
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011708 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011710
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011711 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011712
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 if (result == -2)
11714 return NULL;
11715
Christian Heimes217cfd12007-12-02 14:31:20 +000011716 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011717}
11718
11719static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011720unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011722 void *data;
11723 enum PyUnicode_Kind kind;
11724 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011725
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011726 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011727 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011728 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011729 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011730 if (PyUnicode_READY(self) == -1) {
11731 return NULL;
11732 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011733 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11734 PyErr_SetString(PyExc_IndexError, "string index out of range");
11735 return NULL;
11736 }
11737 kind = PyUnicode_KIND(self);
11738 data = PyUnicode_DATA(self);
11739 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011740 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741}
11742
Guido van Rossumc2504932007-09-18 19:42:40 +000011743/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011744 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011745static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011746unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011748 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011749
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011750#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011751 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011752#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (_PyUnicode_HASH(self) != -1)
11754 return _PyUnicode_HASH(self);
11755 if (PyUnicode_READY(self) == -1)
11756 return -1;
animalizea1d14252019-01-02 20:16:06 +080011757
Christian Heimes985ecdc2013-11-20 11:46:18 +010011758 x = _Py_HashBytes(PyUnicode_DATA(self),
11759 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011761 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011762}
11763
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011764PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011765 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766\n\
oldkaa0735f2018-02-02 16:52:55 +080011767Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011768such that sub is contained within S[start:end]. Optional\n\
11769arguments start and end are interpreted as in slice notation.\n\
11770\n\
11771Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772
11773static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011774unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011775{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011776 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011777 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011778 PyObject *substring = NULL;
11779 Py_ssize_t start = 0;
11780 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011781
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011782 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011783 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011785 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011786 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011787
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011788 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 if (result == -2)
11791 return NULL;
11792
Guido van Rossumd57fd912000-03-10 22:53:23 +000011793 if (result < 0) {
11794 PyErr_SetString(PyExc_ValueError, "substring not found");
11795 return NULL;
11796 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011797
Christian Heimes217cfd12007-12-02 14:31:20 +000011798 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011799}
11800
INADA Naoki3ae20562017-01-16 20:41:20 +090011801/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011802str.isascii as unicode_isascii
11803
11804Return True if all characters in the string are ASCII, False otherwise.
11805
11806ASCII characters have code points in the range U+0000-U+007F.
11807Empty string is ASCII too.
11808[clinic start generated code]*/
11809
11810static PyObject *
11811unicode_isascii_impl(PyObject *self)
11812/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11813{
11814 if (PyUnicode_READY(self) == -1) {
11815 return NULL;
11816 }
11817 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11818}
11819
11820/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011821str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011822
INADA Naoki3ae20562017-01-16 20:41:20 +090011823Return True if the string is a lowercase string, False otherwise.
11824
11825A string is lowercase if all cased characters in the string are lowercase and
11826there is at least one cased character in the string.
11827[clinic start generated code]*/
11828
11829static PyObject *
11830unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011831/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011832{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 Py_ssize_t i, length;
11834 int kind;
11835 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 int cased;
11837
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 if (PyUnicode_READY(self) == -1)
11839 return NULL;
11840 length = PyUnicode_GET_LENGTH(self);
11841 kind = PyUnicode_KIND(self);
11842 data = PyUnicode_DATA(self);
11843
Guido van Rossumd57fd912000-03-10 22:53:23 +000011844 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (length == 1)
11846 return PyBool_FromLong(
11847 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011849 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011851 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011852
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 for (i = 0; i < length; i++) {
11855 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011856
Benjamin Peterson29060642009-01-31 22:14:21 +000011857 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011858 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011859 else if (!cased && Py_UNICODE_ISLOWER(ch))
11860 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011861 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011862 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011863}
11864
INADA Naoki3ae20562017-01-16 20:41:20 +090011865/*[clinic input]
11866str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867
INADA Naoki3ae20562017-01-16 20:41:20 +090011868Return True if the string is an uppercase string, False otherwise.
11869
11870A string is uppercase if all cased characters in the string are uppercase and
11871there is at least one cased character in the string.
11872[clinic start generated code]*/
11873
11874static PyObject *
11875unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011876/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011878 Py_ssize_t i, length;
11879 int kind;
11880 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 int cased;
11882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011883 if (PyUnicode_READY(self) == -1)
11884 return NULL;
11885 length = PyUnicode_GET_LENGTH(self);
11886 kind = PyUnicode_KIND(self);
11887 data = PyUnicode_DATA(self);
11888
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011890 if (length == 1)
11891 return PyBool_FromLong(
11892 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011894 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011895 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011896 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011897
Guido van Rossumd57fd912000-03-10 22:53:23 +000011898 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 for (i = 0; i < length; i++) {
11900 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011901
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011903 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011904 else if (!cased && Py_UNICODE_ISUPPER(ch))
11905 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011907 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908}
11909
INADA Naoki3ae20562017-01-16 20:41:20 +090011910/*[clinic input]
11911str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
INADA Naoki3ae20562017-01-16 20:41:20 +090011913Return True if the string is a title-cased string, False otherwise.
11914
11915In a title-cased string, upper- and title-case characters may only
11916follow uncased characters and lowercase characters only cased ones.
11917[clinic start generated code]*/
11918
11919static PyObject *
11920unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011921/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011922{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011923 Py_ssize_t i, length;
11924 int kind;
11925 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926 int cased, previous_is_cased;
11927
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011928 if (PyUnicode_READY(self) == -1)
11929 return NULL;
11930 length = PyUnicode_GET_LENGTH(self);
11931 kind = PyUnicode_KIND(self);
11932 data = PyUnicode_DATA(self);
11933
Guido van Rossumd57fd912000-03-10 22:53:23 +000011934 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011935 if (length == 1) {
11936 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11937 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11938 (Py_UNICODE_ISUPPER(ch) != 0));
11939 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011940
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011941 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011943 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011944
Guido van Rossumd57fd912000-03-10 22:53:23 +000011945 cased = 0;
11946 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 for (i = 0; i < length; i++) {
11948 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011949
Benjamin Peterson29060642009-01-31 22:14:21 +000011950 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11951 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011952 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011953 previous_is_cased = 1;
11954 cased = 1;
11955 }
11956 else if (Py_UNICODE_ISLOWER(ch)) {
11957 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011958 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011959 previous_is_cased = 1;
11960 cased = 1;
11961 }
11962 else
11963 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011964 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011965 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011966}
11967
INADA Naoki3ae20562017-01-16 20:41:20 +090011968/*[clinic input]
11969str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970
INADA Naoki3ae20562017-01-16 20:41:20 +090011971Return True if the string is a whitespace string, False otherwise.
11972
11973A string is whitespace if all characters in the string are whitespace and there
11974is at least one character in the string.
11975[clinic start generated code]*/
11976
11977static PyObject *
11978unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011979/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011981 Py_ssize_t i, length;
11982 int kind;
11983 void *data;
11984
11985 if (PyUnicode_READY(self) == -1)
11986 return NULL;
11987 length = PyUnicode_GET_LENGTH(self);
11988 kind = PyUnicode_KIND(self);
11989 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011990
Guido van Rossumd57fd912000-03-10 22:53:23 +000011991 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 if (length == 1)
11993 return PyBool_FromLong(
11994 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011996 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011998 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 for (i = 0; i < length; i++) {
12001 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012002 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012004 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012005 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006}
12007
INADA Naoki3ae20562017-01-16 20:41:20 +090012008/*[clinic input]
12009str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012010
INADA Naoki3ae20562017-01-16 20:41:20 +090012011Return True if the string is an alphabetic string, False otherwise.
12012
12013A string is alphabetic if all characters in the string are alphabetic and there
12014is at least one character in the string.
12015[clinic start generated code]*/
12016
12017static PyObject *
12018unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012019/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 Py_ssize_t i, length;
12022 int kind;
12023 void *data;
12024
12025 if (PyUnicode_READY(self) == -1)
12026 return NULL;
12027 length = PyUnicode_GET_LENGTH(self);
12028 kind = PyUnicode_KIND(self);
12029 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012030
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012031 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 if (length == 1)
12033 return PyBool_FromLong(
12034 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012035
12036 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012038 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 for (i = 0; i < length; i++) {
12041 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012043 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012044 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012045}
12046
INADA Naoki3ae20562017-01-16 20:41:20 +090012047/*[clinic input]
12048str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012049
INADA Naoki3ae20562017-01-16 20:41:20 +090012050Return True if the string is an alpha-numeric string, False otherwise.
12051
12052A string is alpha-numeric if all characters in the string are alpha-numeric and
12053there is at least one character in the string.
12054[clinic start generated code]*/
12055
12056static PyObject *
12057unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012058/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 int kind;
12061 void *data;
12062 Py_ssize_t len, i;
12063
12064 if (PyUnicode_READY(self) == -1)
12065 return NULL;
12066
12067 kind = PyUnicode_KIND(self);
12068 data = PyUnicode_DATA(self);
12069 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012070
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012071 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 if (len == 1) {
12073 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12074 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12075 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012076
12077 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012079 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012080
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012081 for (i = 0; i < len; i++) {
12082 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012083 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012084 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012085 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012086 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012087}
12088
INADA Naoki3ae20562017-01-16 20:41:20 +090012089/*[clinic input]
12090str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091
INADA Naoki3ae20562017-01-16 20:41:20 +090012092Return True if the string is a decimal string, False otherwise.
12093
12094A string is a decimal string if all characters in the string are decimal and
12095there is at least one character in the string.
12096[clinic start generated code]*/
12097
12098static PyObject *
12099unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012100/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012102 Py_ssize_t i, length;
12103 int kind;
12104 void *data;
12105
12106 if (PyUnicode_READY(self) == -1)
12107 return NULL;
12108 length = PyUnicode_GET_LENGTH(self);
12109 kind = PyUnicode_KIND(self);
12110 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012111
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012113 if (length == 1)
12114 return PyBool_FromLong(
12115 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012117 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012118 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012119 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012120
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 for (i = 0; i < length; i++) {
12122 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012123 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012125 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012126}
12127
INADA Naoki3ae20562017-01-16 20:41:20 +090012128/*[clinic input]
12129str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130
INADA Naoki3ae20562017-01-16 20:41:20 +090012131Return True if the string is a digit string, False otherwise.
12132
12133A string is a digit string if all characters in the string are digits and there
12134is at least one character in the string.
12135[clinic start generated code]*/
12136
12137static PyObject *
12138unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012139/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012140{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012141 Py_ssize_t i, length;
12142 int kind;
12143 void *data;
12144
12145 if (PyUnicode_READY(self) == -1)
12146 return NULL;
12147 length = PyUnicode_GET_LENGTH(self);
12148 kind = PyUnicode_KIND(self);
12149 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012150
Guido van Rossumd57fd912000-03-10 22:53:23 +000012151 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012152 if (length == 1) {
12153 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12154 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12155 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012156
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012157 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012159 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 for (i = 0; i < length; i++) {
12162 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012163 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012164 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012165 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012166}
12167
INADA Naoki3ae20562017-01-16 20:41:20 +090012168/*[clinic input]
12169str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170
INADA Naoki3ae20562017-01-16 20:41:20 +090012171Return True if the string is a numeric string, False otherwise.
12172
12173A string is numeric if all characters in the string are numeric and there is at
12174least one character in the string.
12175[clinic start generated code]*/
12176
12177static PyObject *
12178unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012179/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012180{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012181 Py_ssize_t i, length;
12182 int kind;
12183 void *data;
12184
12185 if (PyUnicode_READY(self) == -1)
12186 return NULL;
12187 length = PyUnicode_GET_LENGTH(self);
12188 kind = PyUnicode_KIND(self);
12189 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012190
Guido van Rossumd57fd912000-03-10 22:53:23 +000012191 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 if (length == 1)
12193 return PyBool_FromLong(
12194 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012196 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012197 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012198 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012199
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012200 for (i = 0; i < length; i++) {
12201 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012202 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012203 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012204 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012205}
12206
Martin v. Löwis47383402007-08-15 07:32:56 +000012207int
12208PyUnicode_IsIdentifier(PyObject *self)
12209{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012210 int kind;
12211 void *data;
12212 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012213 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012214
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012215 if (PyUnicode_READY(self) == -1) {
12216 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012217 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012218 }
12219
12220 /* Special case for empty strings */
12221 if (PyUnicode_GET_LENGTH(self) == 0)
12222 return 0;
12223 kind = PyUnicode_KIND(self);
12224 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012225
12226 /* PEP 3131 says that the first character must be in
12227 XID_Start and subsequent characters in XID_Continue,
12228 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012229 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012230 letters, digits, underscore). However, given the current
12231 definition of XID_Start and XID_Continue, it is sufficient
12232 to check just for these, except that _ must be allowed
12233 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012234 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012235 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012236 return 0;
12237
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012238 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012240 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012241 return 1;
12242}
12243
INADA Naoki3ae20562017-01-16 20:41:20 +090012244/*[clinic input]
12245str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012246
INADA Naoki3ae20562017-01-16 20:41:20 +090012247Return True if the string is a valid Python identifier, False otherwise.
12248
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012249Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012250such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012251[clinic start generated code]*/
12252
12253static PyObject *
12254unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012255/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012256{
12257 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12258}
12259
INADA Naoki3ae20562017-01-16 20:41:20 +090012260/*[clinic input]
12261str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012262
INADA Naoki3ae20562017-01-16 20:41:20 +090012263Return True if the string is printable, False otherwise.
12264
12265A string is printable if all of its characters are considered printable in
12266repr() or if it is empty.
12267[clinic start generated code]*/
12268
12269static PyObject *
12270unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012271/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012272{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012273 Py_ssize_t i, length;
12274 int kind;
12275 void *data;
12276
12277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279 length = PyUnicode_GET_LENGTH(self);
12280 kind = PyUnicode_KIND(self);
12281 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012282
12283 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012284 if (length == 1)
12285 return PyBool_FromLong(
12286 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012287
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 for (i = 0; i < length; i++) {
12289 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012290 Py_RETURN_FALSE;
12291 }
12292 }
12293 Py_RETURN_TRUE;
12294}
12295
INADA Naoki3ae20562017-01-16 20:41:20 +090012296/*[clinic input]
12297str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012298
INADA Naoki3ae20562017-01-16 20:41:20 +090012299 iterable: object
12300 /
12301
12302Concatenate any number of strings.
12303
Martin Panter91a88662017-01-24 00:30:06 +000012304The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012305The result is returned as a new string.
12306
12307Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12308[clinic start generated code]*/
12309
12310static PyObject *
12311unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012312/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012313{
INADA Naoki3ae20562017-01-16 20:41:20 +090012314 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012315}
12316
Martin v. Löwis18e16552006-02-15 17:27:45 +000012317static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012318unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012320 if (PyUnicode_READY(self) == -1)
12321 return -1;
12322 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323}
12324
INADA Naoki3ae20562017-01-16 20:41:20 +090012325/*[clinic input]
12326str.ljust as unicode_ljust
12327
12328 width: Py_ssize_t
12329 fillchar: Py_UCS4 = ' '
12330 /
12331
12332Return a left-justified string of length width.
12333
12334Padding is done using the specified fill character (default is a space).
12335[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012336
12337static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012338unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12339/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012341 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012343
Victor Stinnerc4b49542011-12-11 22:44:26 +010012344 if (PyUnicode_GET_LENGTH(self) >= width)
12345 return unicode_result_unchanged(self);
12346
12347 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012348}
12349
INADA Naoki3ae20562017-01-16 20:41:20 +090012350/*[clinic input]
12351str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352
INADA Naoki3ae20562017-01-16 20:41:20 +090012353Return a copy of the string converted to lowercase.
12354[clinic start generated code]*/
12355
12356static PyObject *
12357unicode_lower_impl(PyObject *self)
12358/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012359{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012360 if (PyUnicode_READY(self) == -1)
12361 return NULL;
12362 if (PyUnicode_IS_ASCII(self))
12363 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012364 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012365}
12366
/* Strip modes accepted by _PyUnicode_XStrip(), do_strip() and do_argstrip().
   The values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and the
   table itself are read-only. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode i.  i must be one of the three
   modes above; no bounds check is performed. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012375
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012376/* externally visible for str.strip(unicode) */
12377PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012378_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012379{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012380 void *data;
12381 int kind;
12382 Py_ssize_t i, j, len;
12383 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012384 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12387 return NULL;
12388
12389 kind = PyUnicode_KIND(self);
12390 data = PyUnicode_DATA(self);
12391 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012392 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012393 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12394 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012395 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012396
Benjamin Peterson14339b62009-01-31 16:36:08 +000012397 i = 0;
12398 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012399 while (i < len) {
12400 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12401 if (!BLOOM(sepmask, ch))
12402 break;
12403 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12404 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012405 i++;
12406 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012407 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012408
Benjamin Peterson14339b62009-01-31 16:36:08 +000012409 j = len;
12410 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012411 j--;
12412 while (j >= i) {
12413 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12414 if (!BLOOM(sepmask, ch))
12415 break;
12416 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12417 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012418 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012419 }
12420
Benjamin Peterson29060642009-01-31 22:14:21 +000012421 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012423
Victor Stinner7931d9a2011-11-04 00:22:48 +010012424 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012425}
12426
12427PyObject*
12428PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12429{
12430 unsigned char *data;
12431 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012432 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012433
Victor Stinnerde636f32011-10-01 03:55:54 +020012434 if (PyUnicode_READY(self) == -1)
12435 return NULL;
12436
Victor Stinner684d5fd2012-05-03 02:32:34 +020012437 length = PyUnicode_GET_LENGTH(self);
12438 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012439
Victor Stinner684d5fd2012-05-03 02:32:34 +020012440 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012441 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012442
Victor Stinnerde636f32011-10-01 03:55:54 +020012443 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012444 PyErr_SetString(PyExc_IndexError, "string index out of range");
12445 return NULL;
12446 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012447 if (start >= length || end < start)
12448 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012449
Victor Stinner684d5fd2012-05-03 02:32:34 +020012450 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012451 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012452 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012453 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012454 }
12455 else {
12456 kind = PyUnicode_KIND(self);
12457 data = PyUnicode_1BYTE_DATA(self);
12458 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012459 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012460 length);
12461 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012462}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012463
12464static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012465do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012466{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012467 Py_ssize_t len, i, j;
12468
12469 if (PyUnicode_READY(self) == -1)
12470 return NULL;
12471
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012472 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473
Victor Stinnercc7af722013-04-09 22:39:24 +020012474 if (PyUnicode_IS_ASCII(self)) {
12475 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12476
12477 i = 0;
12478 if (striptype != RIGHTSTRIP) {
12479 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012480 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012481 if (!_Py_ascii_whitespace[ch])
12482 break;
12483 i++;
12484 }
12485 }
12486
12487 j = len;
12488 if (striptype != LEFTSTRIP) {
12489 j--;
12490 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012491 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012492 if (!_Py_ascii_whitespace[ch])
12493 break;
12494 j--;
12495 }
12496 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012497 }
12498 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012499 else {
12500 int kind = PyUnicode_KIND(self);
12501 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502
Victor Stinnercc7af722013-04-09 22:39:24 +020012503 i = 0;
12504 if (striptype != RIGHTSTRIP) {
12505 while (i < len) {
12506 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12507 if (!Py_UNICODE_ISSPACE(ch))
12508 break;
12509 i++;
12510 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012511 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012512
12513 j = len;
12514 if (striptype != LEFTSTRIP) {
12515 j--;
12516 while (j >= i) {
12517 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12518 if (!Py_UNICODE_ISSPACE(ch))
12519 break;
12520 j--;
12521 }
12522 j++;
12523 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012524 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012525
Victor Stinner7931d9a2011-11-04 00:22:48 +010012526 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527}
12528
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012529
12530static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012531do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012532{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012533 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012534 if (PyUnicode_Check(sep))
12535 return _PyUnicode_XStrip(self, striptype, sep);
12536 else {
12537 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012538 "%s arg must be None or str",
12539 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012540 return NULL;
12541 }
12542 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012543
Benjamin Peterson14339b62009-01-31 16:36:08 +000012544 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012545}
12546
12547
INADA Naoki3ae20562017-01-16 20:41:20 +090012548/*[clinic input]
12549str.strip as unicode_strip
12550
12551 chars: object = None
12552 /
12553
Zachary Ware09895c22019-10-09 16:09:00 -050012554Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012555
12556If chars is given and not None, remove characters in chars instead.
12557[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012558
12559static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012560unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012561/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012562{
INADA Naoki3ae20562017-01-16 20:41:20 +090012563 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012564}
12565
12566
INADA Naoki3ae20562017-01-16 20:41:20 +090012567/*[clinic input]
12568str.lstrip as unicode_lstrip
12569
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012570 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012571 /
12572
12573Return a copy of the string with leading whitespace removed.
12574
12575If chars is given and not None, remove characters in chars instead.
12576[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012577
12578static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012579unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012580/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012581{
INADA Naoki3ae20562017-01-16 20:41:20 +090012582 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012583}
12584
12585
INADA Naoki3ae20562017-01-16 20:41:20 +090012586/*[clinic input]
12587str.rstrip as unicode_rstrip
12588
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012589 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012590 /
12591
12592Return a copy of the string with trailing whitespace removed.
12593
12594If chars is given and not None, remove characters in chars instead.
12595[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012596
12597static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012598unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012599/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012600{
INADA Naoki3ae20562017-01-16 20:41:20 +090012601 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012602}
12603
12604
Guido van Rossumd57fd912000-03-10 22:53:23 +000012605static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012606unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012607{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012608 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012609 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012610
Serhiy Storchaka05997252013-01-26 12:14:02 +020012611 if (len < 1)
12612 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012613
Victor Stinnerc4b49542011-12-11 22:44:26 +010012614 /* no repeat, return original string */
12615 if (len == 1)
12616 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012617
Benjamin Petersonbac79492012-01-14 13:34:47 -050012618 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012619 return NULL;
12620
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012621 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012622 PyErr_SetString(PyExc_OverflowError,
12623 "repeated string is too long");
12624 return NULL;
12625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012627
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012628 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012629 if (!u)
12630 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012631 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012633 if (PyUnicode_GET_LENGTH(str) == 1) {
12634 const int kind = PyUnicode_KIND(str);
12635 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012636 if (kind == PyUnicode_1BYTE_KIND) {
12637 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012638 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012639 }
12640 else if (kind == PyUnicode_2BYTE_KIND) {
12641 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012642 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012643 ucs2[n] = fill_char;
12644 } else {
12645 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12646 assert(kind == PyUnicode_4BYTE_KIND);
12647 for (n = 0; n < len; ++n)
12648 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012649 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 }
12651 else {
12652 /* number of characters copied this far */
12653 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012654 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012656 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012658 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012660 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012661 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012663 }
12664
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012665 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012666 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667}
12668
Alexander Belopolsky40018472011-02-26 01:02:56 +000012669PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012670PyUnicode_Replace(PyObject *str,
12671 PyObject *substr,
12672 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012673 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012674{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012675 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12676 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012677 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012678 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012679}
12680
INADA Naoki3ae20562017-01-16 20:41:20 +090012681/*[clinic input]
12682str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683
INADA Naoki3ae20562017-01-16 20:41:20 +090012684 old: unicode
12685 new: unicode
12686 count: Py_ssize_t = -1
12687 Maximum number of occurrences to replace.
12688 -1 (the default value) means replace all occurrences.
12689 /
12690
12691Return a copy with all occurrences of substring old replaced by new.
12692
12693If the optional argument count is given, only the first count occurrences are
12694replaced.
12695[clinic start generated code]*/
12696
12697static PyObject *
12698unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12699 Py_ssize_t count)
12700/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012701{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012702 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012703 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012704 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705}
12706
Alexander Belopolsky40018472011-02-26 01:02:56 +000012707static PyObject *
12708unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012710 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 Py_ssize_t isize;
12712 Py_ssize_t osize, squote, dquote, i, o;
12713 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012714 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012717 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012718 return NULL;
12719
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012720 isize = PyUnicode_GET_LENGTH(unicode);
12721 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012723 /* Compute length of output, quote characters, and
12724 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012725 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012726 max = 127;
12727 squote = dquote = 0;
12728 ikind = PyUnicode_KIND(unicode);
12729 for (i = 0; i < isize; i++) {
12730 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012731 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012732 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012733 case '\'': squote++; break;
12734 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012735 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012736 incr = 2;
12737 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012738 default:
12739 /* Fast-path ASCII */
12740 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012741 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012743 ;
12744 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012745 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012747 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012748 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012749 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012751 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012753 if (osize > PY_SSIZE_T_MAX - incr) {
12754 PyErr_SetString(PyExc_OverflowError,
12755 "string is too long to generate repr");
12756 return NULL;
12757 }
12758 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012759 }
12760
12761 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012762 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012764 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012765 if (dquote)
12766 /* Both squote and dquote present. Use squote,
12767 and escape them */
12768 osize += squote;
12769 else
12770 quote = '"';
12771 }
Victor Stinner55c08782013-04-14 18:45:39 +020012772 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012773
12774 repr = PyUnicode_New(osize, max);
12775 if (repr == NULL)
12776 return NULL;
12777 okind = PyUnicode_KIND(repr);
12778 odata = PyUnicode_DATA(repr);
12779
12780 PyUnicode_WRITE(okind, odata, 0, quote);
12781 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012782 if (unchanged) {
12783 _PyUnicode_FastCopyCharacters(repr, 1,
12784 unicode, 0,
12785 isize);
12786 }
12787 else {
12788 for (i = 0, o = 1; i < isize; i++) {
12789 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012790
Victor Stinner55c08782013-04-14 18:45:39 +020012791 /* Escape quotes and backslashes */
12792 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012793 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012795 continue;
12796 }
12797
12798 /* Map special whitespace to '\t', \n', '\r' */
12799 if (ch == '\t') {
12800 PyUnicode_WRITE(okind, odata, o++, '\\');
12801 PyUnicode_WRITE(okind, odata, o++, 't');
12802 }
12803 else if (ch == '\n') {
12804 PyUnicode_WRITE(okind, odata, o++, '\\');
12805 PyUnicode_WRITE(okind, odata, o++, 'n');
12806 }
12807 else if (ch == '\r') {
12808 PyUnicode_WRITE(okind, odata, o++, '\\');
12809 PyUnicode_WRITE(okind, odata, o++, 'r');
12810 }
12811
12812 /* Map non-printable US ASCII to '\xhh' */
12813 else if (ch < ' ' || ch == 0x7F) {
12814 PyUnicode_WRITE(okind, odata, o++, '\\');
12815 PyUnicode_WRITE(okind, odata, o++, 'x');
12816 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12817 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12818 }
12819
12820 /* Copy ASCII characters as-is */
12821 else if (ch < 0x7F) {
12822 PyUnicode_WRITE(okind, odata, o++, ch);
12823 }
12824
12825 /* Non-ASCII characters */
12826 else {
12827 /* Map Unicode whitespace and control characters
12828 (categories Z* and C* except ASCII space)
12829 */
12830 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12831 PyUnicode_WRITE(okind, odata, o++, '\\');
12832 /* Map 8-bit characters to '\xhh' */
12833 if (ch <= 0xff) {
12834 PyUnicode_WRITE(okind, odata, o++, 'x');
12835 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12836 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12837 }
12838 /* Map 16-bit characters to '\uxxxx' */
12839 else if (ch <= 0xffff) {
12840 PyUnicode_WRITE(okind, odata, o++, 'u');
12841 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12842 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12843 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12844 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12845 }
12846 /* Map 21-bit characters to '\U00xxxxxx' */
12847 else {
12848 PyUnicode_WRITE(okind, odata, o++, 'U');
12849 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12850 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12851 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12852 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12853 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12854 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12855 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12856 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12857 }
12858 }
12859 /* Copy characters as-is */
12860 else {
12861 PyUnicode_WRITE(okind, odata, o++, ch);
12862 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012863 }
12864 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012865 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012866 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012867 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012868 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012869}
12870
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012871PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012872 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873\n\
12874Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012875such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876arguments start and end are interpreted as in slice notation.\n\
12877\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012878Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012879
12880static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012881unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012883 /* initialize variables to prevent gcc warning */
12884 PyObject *substring = NULL;
12885 Py_ssize_t start = 0;
12886 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012887 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012889 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012890 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012892 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012893 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012894
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012895 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012896
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 if (result == -2)
12898 return NULL;
12899
Christian Heimes217cfd12007-12-02 14:31:20 +000012900 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901}
12902
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012903PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012904 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012906Return the highest index in S where substring sub is found,\n\
12907such that sub is contained within S[start:end]. Optional\n\
12908arguments start and end are interpreted as in slice notation.\n\
12909\n\
12910Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012911
12912static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012913unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012914{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012915 /* initialize variables to prevent gcc warning */
12916 PyObject *substring = NULL;
12917 Py_ssize_t start = 0;
12918 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012919 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012921 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012922 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012923
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012924 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012925 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012927 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 if (result == -2)
12930 return NULL;
12931
Guido van Rossumd57fd912000-03-10 22:53:23 +000012932 if (result < 0) {
12933 PyErr_SetString(PyExc_ValueError, "substring not found");
12934 return NULL;
12935 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012936
Christian Heimes217cfd12007-12-02 14:31:20 +000012937 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012938}
12939
INADA Naoki3ae20562017-01-16 20:41:20 +090012940/*[clinic input]
12941str.rjust as unicode_rjust
12942
12943 width: Py_ssize_t
12944 fillchar: Py_UCS4 = ' '
12945 /
12946
12947Return a right-justified string of length width.
12948
12949Padding is done using the specified fill character (default is a space).
12950[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012951
12952static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012953unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12954/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012956 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012957 return NULL;
12958
Victor Stinnerc4b49542011-12-11 22:44:26 +010012959 if (PyUnicode_GET_LENGTH(self) >= width)
12960 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961
Victor Stinnerc4b49542011-12-11 22:44:26 +010012962 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012963}
12964
Alexander Belopolsky40018472011-02-26 01:02:56 +000012965PyObject *
12966PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012968 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012969 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012970
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012971 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012972}
12973
INADA Naoki3ae20562017-01-16 20:41:20 +090012974/*[clinic input]
12975str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976
INADA Naoki3ae20562017-01-16 20:41:20 +090012977 sep: object = None
12978 The delimiter according which to split the string.
12979 None (the default value) means split according to any whitespace,
12980 and discard empty strings from the result.
12981 maxsplit: Py_ssize_t = -1
12982 Maximum number of splits to do.
12983 -1 (the default value) means no limit.
12984
12985Return a list of the words in the string, using sep as the delimiter string.
12986[clinic start generated code]*/
12987
12988static PyObject *
12989unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12990/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012991{
INADA Naoki3ae20562017-01-16 20:41:20 +090012992 if (sep == Py_None)
12993 return split(self, NULL, maxsplit);
12994 if (PyUnicode_Check(sep))
12995 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012996
Victor Stinner998b8062018-09-12 00:23:25 +020012997 PyErr_Format(PyExc_TypeError,
12998 "must be str or None, not %.100s",
12999 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013000 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013001}
13002
Thomas Wouters477c8d52006-05-27 19:21:47 +000013003PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013004PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013005{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013006 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013007 int kind1, kind2;
13008 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013010
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013011 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013012 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013013
Victor Stinner14f8f022011-10-05 20:58:25 +020013014 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013015 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 len1 = PyUnicode_GET_LENGTH(str_obj);
13017 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013018 if (kind1 < kind2 || len1 < len2) {
13019 _Py_INCREF_UNICODE_EMPTY();
13020 if (!unicode_empty)
13021 out = NULL;
13022 else {
13023 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13024 Py_DECREF(unicode_empty);
13025 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013026 return out;
13027 }
13028 buf1 = PyUnicode_DATA(str_obj);
13029 buf2 = PyUnicode_DATA(sep_obj);
13030 if (kind2 != kind1) {
13031 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13032 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013033 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013034 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013035
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013036 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013037 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013038 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13039 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13040 else
13041 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013042 break;
13043 case PyUnicode_2BYTE_KIND:
13044 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13045 break;
13046 case PyUnicode_4BYTE_KIND:
13047 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13048 break;
13049 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013050 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013051 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013053 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013054 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013055
13056 return out;
13057}
13058
13059
13060PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013061PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013064 int kind1, kind2;
13065 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013068 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013070
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013071 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 len1 = PyUnicode_GET_LENGTH(str_obj);
13074 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013075 if (kind1 < kind2 || len1 < len2) {
13076 _Py_INCREF_UNICODE_EMPTY();
13077 if (!unicode_empty)
13078 out = NULL;
13079 else {
13080 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13081 Py_DECREF(unicode_empty);
13082 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013083 return out;
13084 }
13085 buf1 = PyUnicode_DATA(str_obj);
13086 buf2 = PyUnicode_DATA(sep_obj);
13087 if (kind2 != kind1) {
13088 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13089 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013090 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013093 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013095 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13096 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13097 else
13098 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 break;
13100 case PyUnicode_2BYTE_KIND:
13101 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13102 break;
13103 case PyUnicode_4BYTE_KIND:
13104 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13105 break;
13106 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013107 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013109
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013110 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013112
13113 return out;
13114}
13115
INADA Naoki3ae20562017-01-16 20:41:20 +090013116/*[clinic input]
13117str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013118
INADA Naoki3ae20562017-01-16 20:41:20 +090013119 sep: object
13120 /
13121
13122Partition the string into three parts using the given separator.
13123
13124This will search for the separator in the string. If the separator is found,
13125returns a 3-tuple containing the part before the separator, the separator
13126itself, and the part after it.
13127
13128If the separator is not found, returns a 3-tuple containing the original string
13129and two empty strings.
13130[clinic start generated code]*/
13131
13132static PyObject *
13133unicode_partition(PyObject *self, PyObject *sep)
13134/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013135{
INADA Naoki3ae20562017-01-16 20:41:20 +090013136 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013137}
13138
INADA Naoki3ae20562017-01-16 20:41:20 +090013139/*[clinic input]
13140str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013141
INADA Naoki3ae20562017-01-16 20:41:20 +090013142Partition the string into three parts using the given separator.
13143
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013144This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013145the separator is found, returns a 3-tuple containing the part before the
13146separator, the separator itself, and the part after it.
13147
13148If the separator is not found, returns a 3-tuple containing two empty strings
13149and the original string.
13150[clinic start generated code]*/
13151
13152static PyObject *
13153unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013154/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013155{
INADA Naoki3ae20562017-01-16 20:41:20 +090013156 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013157}
13158
Alexander Belopolsky40018472011-02-26 01:02:56 +000013159PyObject *
13160PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013161{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013162 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013163 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013164
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013165 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013166}
13167
INADA Naoki3ae20562017-01-16 20:41:20 +090013168/*[clinic input]
13169str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013170
INADA Naoki3ae20562017-01-16 20:41:20 +090013171Return a list of the words in the string, using sep as the delimiter string.
13172
13173Splits are done starting at the end of the string and working to the front.
13174[clinic start generated code]*/
13175
13176static PyObject *
13177unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13178/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013179{
INADA Naoki3ae20562017-01-16 20:41:20 +090013180 if (sep == Py_None)
13181 return rsplit(self, NULL, maxsplit);
13182 if (PyUnicode_Check(sep))
13183 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013184
Victor Stinner998b8062018-09-12 00:23:25 +020013185 PyErr_Format(PyExc_TypeError,
13186 "must be str or None, not %.100s",
13187 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013188 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013189}
13190
INADA Naoki3ae20562017-01-16 20:41:20 +090013191/*[clinic input]
13192str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013193
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013194 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013195
13196Return a list of the lines in the string, breaking at line boundaries.
13197
13198Line breaks are not included in the resulting list unless keepends is given and
13199true.
13200[clinic start generated code]*/
13201
13202static PyObject *
13203unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013204/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013205{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013206 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013207}
13208
13209static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013210PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013212 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013213}
13214
INADA Naoki3ae20562017-01-16 20:41:20 +090013215/*[clinic input]
13216str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217
INADA Naoki3ae20562017-01-16 20:41:20 +090013218Convert uppercase characters to lowercase and lowercase characters to uppercase.
13219[clinic start generated code]*/
13220
13221static PyObject *
13222unicode_swapcase_impl(PyObject *self)
13223/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013224{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013225 if (PyUnicode_READY(self) == -1)
13226 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013227 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228}
13229
Larry Hastings61272b72014-01-07 12:41:53 -080013230/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013231
Larry Hastings31826802013-10-19 00:09:25 -070013232@staticmethod
13233str.maketrans as unicode_maketrans
13234
13235 x: object
13236
13237 y: unicode=NULL
13238
13239 z: unicode=NULL
13240
13241 /
13242
13243Return a translation table usable for str.translate().
13244
13245If there is only one argument, it must be a dictionary mapping Unicode
13246ordinals (integers) or characters to Unicode ordinals, strings or None.
13247Character keys will be then converted to ordinals.
13248If there are two arguments, they must be strings of equal length, and
13249in the resulting dictionary, each character in x will be mapped to the
13250character at the same position in y. If there is a third argument, it
13251must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013252[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013253
Larry Hastings31826802013-10-19 00:09:25 -070013254static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013255unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013256/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013257{
Georg Brandlceee0772007-11-27 23:48:05 +000013258 PyObject *new = NULL, *key, *value;
13259 Py_ssize_t i = 0;
13260 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013261
Georg Brandlceee0772007-11-27 23:48:05 +000013262 new = PyDict_New();
13263 if (!new)
13264 return NULL;
13265 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013266 int x_kind, y_kind, z_kind;
13267 void *x_data, *y_data, *z_data;
13268
Georg Brandlceee0772007-11-27 23:48:05 +000013269 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013270 if (!PyUnicode_Check(x)) {
13271 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13272 "be a string if there is a second argument");
13273 goto err;
13274 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013275 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013276 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13277 "arguments must have equal length");
13278 goto err;
13279 }
13280 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013281 x_kind = PyUnicode_KIND(x);
13282 y_kind = PyUnicode_KIND(y);
13283 x_data = PyUnicode_DATA(x);
13284 y_data = PyUnicode_DATA(y);
13285 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13286 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013287 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013288 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013289 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013290 if (!value) {
13291 Py_DECREF(key);
13292 goto err;
13293 }
Georg Brandlceee0772007-11-27 23:48:05 +000013294 res = PyDict_SetItem(new, key, value);
13295 Py_DECREF(key);
13296 Py_DECREF(value);
13297 if (res < 0)
13298 goto err;
13299 }
13300 /* create entries for deleting chars in z */
13301 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013302 z_kind = PyUnicode_KIND(z);
13303 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013304 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013305 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013306 if (!key)
13307 goto err;
13308 res = PyDict_SetItem(new, key, Py_None);
13309 Py_DECREF(key);
13310 if (res < 0)
13311 goto err;
13312 }
13313 }
13314 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013315 int kind;
13316 void *data;
13317
Georg Brandlceee0772007-11-27 23:48:05 +000013318 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013319 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013320 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13321 "to maketrans it must be a dict");
13322 goto err;
13323 }
13324 /* copy entries into the new dict, converting string keys to int keys */
13325 while (PyDict_Next(x, &i, &key, &value)) {
13326 if (PyUnicode_Check(key)) {
13327 /* convert string keys to integer keys */
13328 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013329 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013330 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13331 "table must be of length 1");
13332 goto err;
13333 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013334 kind = PyUnicode_KIND(key);
13335 data = PyUnicode_DATA(key);
13336 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013337 if (!newkey)
13338 goto err;
13339 res = PyDict_SetItem(new, newkey, value);
13340 Py_DECREF(newkey);
13341 if (res < 0)
13342 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013343 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013344 /* just keep integer keys */
13345 if (PyDict_SetItem(new, key, value) < 0)
13346 goto err;
13347 } else {
13348 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13349 "be strings or integers");
13350 goto err;
13351 }
13352 }
13353 }
13354 return new;
13355 err:
13356 Py_DECREF(new);
13357 return NULL;
13358}
13359
INADA Naoki3ae20562017-01-16 20:41:20 +090013360/*[clinic input]
13361str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362
INADA Naoki3ae20562017-01-16 20:41:20 +090013363 table: object
13364 Translation table, which must be a mapping of Unicode ordinals to
13365 Unicode ordinals, strings, or None.
13366 /
13367
13368Replace each character in the string using the given translation table.
13369
13370The table must implement lookup/indexing via __getitem__, for instance a
13371dictionary or list. If this operation raises LookupError, the character is
13372left untouched. Characters mapped to None are deleted.
13373[clinic start generated code]*/
13374
13375static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013376unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013377/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013378{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013379 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013380}
13381
INADA Naoki3ae20562017-01-16 20:41:20 +090013382/*[clinic input]
13383str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384
INADA Naoki3ae20562017-01-16 20:41:20 +090013385Return a copy of the string converted to uppercase.
13386[clinic start generated code]*/
13387
13388static PyObject *
13389unicode_upper_impl(PyObject *self)
13390/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013392 if (PyUnicode_READY(self) == -1)
13393 return NULL;
13394 if (PyUnicode_IS_ASCII(self))
13395 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013396 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397}
13398
INADA Naoki3ae20562017-01-16 20:41:20 +090013399/*[clinic input]
13400str.zfill as unicode_zfill
13401
13402 width: Py_ssize_t
13403 /
13404
13405Pad a numeric string with zeros on the left, to fill a field of the given width.
13406
13407The string is never truncated.
13408[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013409
13410static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013411unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013412/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013414 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013415 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013416 int kind;
13417 void *data;
13418 Py_UCS4 chr;
13419
Benjamin Petersonbac79492012-01-14 13:34:47 -050013420 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013421 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422
Victor Stinnerc4b49542011-12-11 22:44:26 +010013423 if (PyUnicode_GET_LENGTH(self) >= width)
13424 return unicode_result_unchanged(self);
13425
13426 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427
13428 u = pad(self, fill, 0, '0');
13429
Walter Dörwald068325e2002-04-15 13:36:47 +000013430 if (u == NULL)
13431 return NULL;
13432
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433 kind = PyUnicode_KIND(u);
13434 data = PyUnicode_DATA(u);
13435 chr = PyUnicode_READ(kind, data, fill);
13436
13437 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013438 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013439 PyUnicode_WRITE(kind, data, 0, chr);
13440 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441 }
13442
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013443 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013444 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013446
#if 0
/* Compiled out.  Would forward to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13454
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013455PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013456 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013457\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013458Return True if S starts with the specified prefix, False otherwise.\n\
13459With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013460With optional end, stop comparing S at that position.\n\
13461prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013462
13463static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013464unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013465 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013467 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013468 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013469 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013470 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013471 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013472
Jesus Ceaac451502011-04-20 17:09:23 +020013473 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013474 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 if (PyTuple_Check(subobj)) {
13476 Py_ssize_t i;
13477 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013478 substring = PyTuple_GET_ITEM(subobj, i);
13479 if (!PyUnicode_Check(substring)) {
13480 PyErr_Format(PyExc_TypeError,
13481 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013482 "not %.100s",
13483 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013484 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013485 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013486 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013487 if (result == -1)
13488 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013489 if (result) {
13490 Py_RETURN_TRUE;
13491 }
13492 }
13493 /* nothing matched */
13494 Py_RETURN_FALSE;
13495 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013496 if (!PyUnicode_Check(subobj)) {
13497 PyErr_Format(PyExc_TypeError,
13498 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013499 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013500 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013501 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013502 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013503 if (result == -1)
13504 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013505 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013506}
13507
13508
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013509PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013510 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013511\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013512Return True if S ends with the specified suffix, False otherwise.\n\
13513With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013514With optional end, stop comparing S at that position.\n\
13515suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013516
13517static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013518unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013519 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013520{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013521 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013522 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013523 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013524 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013525 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013526
Jesus Ceaac451502011-04-20 17:09:23 +020013527 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013528 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013529 if (PyTuple_Check(subobj)) {
13530 Py_ssize_t i;
13531 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013532 substring = PyTuple_GET_ITEM(subobj, i);
13533 if (!PyUnicode_Check(substring)) {
13534 PyErr_Format(PyExc_TypeError,
13535 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013536 "not %.100s",
13537 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013538 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013539 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013540 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013541 if (result == -1)
13542 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013543 if (result) {
13544 Py_RETURN_TRUE;
13545 }
13546 }
13547 Py_RETURN_FALSE;
13548 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013549 if (!PyUnicode_Check(subobj)) {
13550 PyErr_Format(PyExc_TypeError,
13551 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013552 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013553 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013554 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013555 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013556 if (result == -1)
13557 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013558 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013559}
13560
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013561static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013562_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013563{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013564 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13565 writer->data = PyUnicode_DATA(writer->buffer);
13566
13567 if (!writer->readonly) {
13568 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013569 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013570 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013571 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013572 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13573 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13574 writer->kind = PyUnicode_WCHAR_KIND;
13575 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13576
Victor Stinner8f674cc2013-04-17 23:02:17 +020013577 /* Copy-on-write mode: set buffer size to 0 so
13578 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13579 * next write. */
13580 writer->size = 0;
13581 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013582}
13583
Victor Stinnerd3f08822012-05-29 12:57:52 +020013584void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013585_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013586{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013587 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013588
13589 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013590 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013591
13592 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13593 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13594 writer->kind = PyUnicode_WCHAR_KIND;
13595 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013596}
13597
Inada Naoki770847a2019-06-24 12:30:24 +090013598// Initialize _PyUnicodeWriter with initial buffer
13599static inline void
13600_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13601{
13602 memset(writer, 0, sizeof(*writer));
13603 writer->buffer = buffer;
13604 _PyUnicodeWriter_Update(writer);
13605 writer->min_length = writer->size;
13606}
13607
Victor Stinnerd3f08822012-05-29 12:57:52 +020013608int
13609_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13610 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013611{
13612 Py_ssize_t newlen;
13613 PyObject *newbuffer;
13614
Victor Stinner2740e462016-09-06 16:58:36 -070013615 assert(maxchar <= MAX_UNICODE);
13616
Victor Stinnerca9381e2015-09-22 00:58:32 +020013617 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013618 assert((maxchar > writer->maxchar && length >= 0)
13619 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013620
Victor Stinner202fdca2012-05-07 12:47:02 +020013621 if (length > PY_SSIZE_T_MAX - writer->pos) {
13622 PyErr_NoMemory();
13623 return -1;
13624 }
13625 newlen = writer->pos + length;
13626
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013627 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013628
Victor Stinnerd3f08822012-05-29 12:57:52 +020013629 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013630 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013631 if (writer->overallocate
13632 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13633 /* overallocate to limit the number of realloc() */
13634 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013635 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013636 if (newlen < writer->min_length)
13637 newlen = writer->min_length;
13638
Victor Stinnerd3f08822012-05-29 12:57:52 +020013639 writer->buffer = PyUnicode_New(newlen, maxchar);
13640 if (writer->buffer == NULL)
13641 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013642 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013643 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013644 if (writer->overallocate
13645 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13646 /* overallocate to limit the number of realloc() */
13647 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013648 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013649 if (newlen < writer->min_length)
13650 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013651
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013652 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013653 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013654 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013655 newbuffer = PyUnicode_New(newlen, maxchar);
13656 if (newbuffer == NULL)
13657 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013658 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13659 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013660 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013661 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013662 }
13663 else {
13664 newbuffer = resize_compact(writer->buffer, newlen);
13665 if (newbuffer == NULL)
13666 return -1;
13667 }
13668 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013669 }
13670 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013671 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013672 newbuffer = PyUnicode_New(writer->size, maxchar);
13673 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013674 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013675 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13676 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013677 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013678 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013679 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013680 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013681
13682#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013683}
13684
Victor Stinnerca9381e2015-09-22 00:58:32 +020013685int
13686_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13687 enum PyUnicode_Kind kind)
13688{
13689 Py_UCS4 maxchar;
13690
13691 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13692 assert(writer->kind < kind);
13693
13694 switch (kind)
13695 {
13696 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13697 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13698 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13699 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013700 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013701 }
13702
13703 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13704}
13705
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013706static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013707_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013708{
Victor Stinner2740e462016-09-06 16:58:36 -070013709 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013710 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13711 return -1;
13712 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13713 writer->pos++;
13714 return 0;
13715}
13716
13717int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013718_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13719{
13720 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13721}
13722
13723int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013724_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13725{
13726 Py_UCS4 maxchar;
13727 Py_ssize_t len;
13728
13729 if (PyUnicode_READY(str) == -1)
13730 return -1;
13731 len = PyUnicode_GET_LENGTH(str);
13732 if (len == 0)
13733 return 0;
13734 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13735 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013736 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013737 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013738 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013739 Py_INCREF(str);
13740 writer->buffer = str;
13741 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013742 writer->pos += len;
13743 return 0;
13744 }
13745 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13746 return -1;
13747 }
13748 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13749 str, 0, len);
13750 writer->pos += len;
13751 return 0;
13752}
13753
Victor Stinnere215d962012-10-06 23:03:36 +020013754int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013755_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13756 Py_ssize_t start, Py_ssize_t end)
13757{
13758 Py_UCS4 maxchar;
13759 Py_ssize_t len;
13760
13761 if (PyUnicode_READY(str) == -1)
13762 return -1;
13763
13764 assert(0 <= start);
13765 assert(end <= PyUnicode_GET_LENGTH(str));
13766 assert(start <= end);
13767
13768 if (end == 0)
13769 return 0;
13770
13771 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13772 return _PyUnicodeWriter_WriteStr(writer, str);
13773
13774 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13775 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13776 else
13777 maxchar = writer->maxchar;
13778 len = end - start;
13779
13780 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13781 return -1;
13782
13783 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13784 str, start, len);
13785 writer->pos += len;
13786 return 0;
13787}
13788
13789int
Victor Stinner4a587072013-11-19 12:54:53 +010013790_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13791 const char *ascii, Py_ssize_t len)
13792{
13793 if (len == -1)
13794 len = strlen(ascii);
13795
13796 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13797
13798 if (writer->buffer == NULL && !writer->overallocate) {
13799 PyObject *str;
13800
13801 str = _PyUnicode_FromASCII(ascii, len);
13802 if (str == NULL)
13803 return -1;
13804
13805 writer->readonly = 1;
13806 writer->buffer = str;
13807 _PyUnicodeWriter_Update(writer);
13808 writer->pos += len;
13809 return 0;
13810 }
13811
13812 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13813 return -1;
13814
13815 switch (writer->kind)
13816 {
13817 case PyUnicode_1BYTE_KIND:
13818 {
13819 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13820 Py_UCS1 *data = writer->data;
13821
Christian Heimesf051e432016-09-13 20:22:02 +020013822 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013823 break;
13824 }
13825 case PyUnicode_2BYTE_KIND:
13826 {
13827 _PyUnicode_CONVERT_BYTES(
13828 Py_UCS1, Py_UCS2,
13829 ascii, ascii + len,
13830 (Py_UCS2 *)writer->data + writer->pos);
13831 break;
13832 }
13833 case PyUnicode_4BYTE_KIND:
13834 {
13835 _PyUnicode_CONVERT_BYTES(
13836 Py_UCS1, Py_UCS4,
13837 ascii, ascii + len,
13838 (Py_UCS4 *)writer->data + writer->pos);
13839 break;
13840 }
13841 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013842 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013843 }
13844
13845 writer->pos += len;
13846 return 0;
13847}
13848
13849int
13850_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13851 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013852{
13853 Py_UCS4 maxchar;
13854
13855 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13856 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13857 return -1;
13858 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13859 writer->pos += len;
13860 return 0;
13861}
13862
Victor Stinnerd3f08822012-05-29 12:57:52 +020013863PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013864_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013865{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013866 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013867
Victor Stinnerd3f08822012-05-29 12:57:52 +020013868 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013869 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013870 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013871 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013872
13873 str = writer->buffer;
13874 writer->buffer = NULL;
13875
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013876 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013877 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13878 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013879 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013880
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013881 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13882 PyObject *str2;
13883 str2 = resize_compact(str, writer->pos);
13884 if (str2 == NULL) {
13885 Py_DECREF(str);
13886 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013887 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013888 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013889 }
13890
Victor Stinner15a0bd32013-07-08 22:29:55 +020013891 assert(_PyUnicode_CheckConsistency(str, 1));
13892 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013893}
13894
Victor Stinnerd3f08822012-05-29 12:57:52 +020013895void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013896_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013897{
13898 Py_CLEAR(writer->buffer);
13899}
13900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013901#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013902
13903PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013904 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013905\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013906Return a formatted version of S, using substitutions from args and kwargs.\n\
13907The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013908
Eric Smith27bbca62010-11-04 17:06:58 +000013909PyDoc_STRVAR(format_map__doc__,
13910 "S.format_map(mapping) -> str\n\
13911\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013912Return a formatted version of S, using substitutions from mapping.\n\
13913The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013914
INADA Naoki3ae20562017-01-16 20:41:20 +090013915/*[clinic input]
13916str.__format__ as unicode___format__
13917
13918 format_spec: unicode
13919 /
13920
13921Return a formatted version of the string as described by format_spec.
13922[clinic start generated code]*/
13923
Eric Smith4a7d76d2008-05-30 18:10:19 +000013924static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013925unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013926/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013927{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013928 _PyUnicodeWriter writer;
13929 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013930
Victor Stinnerd3f08822012-05-29 12:57:52 +020013931 if (PyUnicode_READY(self) == -1)
13932 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013933 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013934 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13935 self, format_spec, 0,
13936 PyUnicode_GET_LENGTH(format_spec));
13937 if (ret == -1) {
13938 _PyUnicodeWriter_Dealloc(&writer);
13939 return NULL;
13940 }
13941 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013942}
13943
INADA Naoki3ae20562017-01-16 20:41:20 +090013944/*[clinic input]
13945str.__sizeof__ as unicode_sizeof
13946
13947Return the size of the string in memory, in bytes.
13948[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013949
13950static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013951unicode_sizeof_impl(PyObject *self)
13952/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013953{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013954 Py_ssize_t size;
13955
13956 /* If it's a compact object, account for base structure +
13957 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013958 if (PyUnicode_IS_COMPACT_ASCII(self))
13959 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13960 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013961 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013962 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013963 else {
13964 /* If it is a two-block object, account for base object, and
13965 for character block if present. */
13966 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013967 if (_PyUnicode_DATA_ANY(self))
13968 size += (PyUnicode_GET_LENGTH(self) + 1) *
13969 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013970 }
13971 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013972 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013973 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13974 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13975 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13976 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013977
13978 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013979}
13980
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013981static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013982unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013983{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013984 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013985 if (!copy)
13986 return NULL;
13987 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013988}
13989
Guido van Rossumd57fd912000-03-10 22:53:23 +000013990static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013991 UNICODE_ENCODE_METHODDEF
13992 UNICODE_REPLACE_METHODDEF
13993 UNICODE_SPLIT_METHODDEF
13994 UNICODE_RSPLIT_METHODDEF
13995 UNICODE_JOIN_METHODDEF
13996 UNICODE_CAPITALIZE_METHODDEF
13997 UNICODE_CASEFOLD_METHODDEF
13998 UNICODE_TITLE_METHODDEF
13999 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014000 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014001 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014002 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014003 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014004 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014005 UNICODE_LJUST_METHODDEF
14006 UNICODE_LOWER_METHODDEF
14007 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014008 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14009 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014010 UNICODE_RJUST_METHODDEF
14011 UNICODE_RSTRIP_METHODDEF
14012 UNICODE_RPARTITION_METHODDEF
14013 UNICODE_SPLITLINES_METHODDEF
14014 UNICODE_STRIP_METHODDEF
14015 UNICODE_SWAPCASE_METHODDEF
14016 UNICODE_TRANSLATE_METHODDEF
14017 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014018 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14019 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014020 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014021 UNICODE_ISLOWER_METHODDEF
14022 UNICODE_ISUPPER_METHODDEF
14023 UNICODE_ISTITLE_METHODDEF
14024 UNICODE_ISSPACE_METHODDEF
14025 UNICODE_ISDECIMAL_METHODDEF
14026 UNICODE_ISDIGIT_METHODDEF
14027 UNICODE_ISNUMERIC_METHODDEF
14028 UNICODE_ISALPHA_METHODDEF
14029 UNICODE_ISALNUM_METHODDEF
14030 UNICODE_ISIDENTIFIER_METHODDEF
14031 UNICODE_ISPRINTABLE_METHODDEF
14032 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014033 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014034 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014035 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014036 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014037 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014038#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014039 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014040 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014041#endif
14042
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014043 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014044 {NULL, NULL}
14045};
14046
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014047static PyObject *
14048unicode_mod(PyObject *v, PyObject *w)
14049{
Brian Curtindfc80e32011-08-10 20:28:54 -050014050 if (!PyUnicode_Check(v))
14051 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014052 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014053}
14054
14055static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014056 0, /*nb_add*/
14057 0, /*nb_subtract*/
14058 0, /*nb_multiply*/
14059 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014060};
14061
Guido van Rossumd57fd912000-03-10 22:53:23 +000014062static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014063 (lenfunc) unicode_length, /* sq_length */
14064 PyUnicode_Concat, /* sq_concat */
14065 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14066 (ssizeargfunc) unicode_getitem, /* sq_item */
14067 0, /* sq_slice */
14068 0, /* sq_ass_item */
14069 0, /* sq_ass_slice */
14070 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014071};
14072
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014073static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014074unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014076 if (PyUnicode_READY(self) == -1)
14077 return NULL;
14078
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014079 if (PyIndex_Check(item)) {
14080 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014081 if (i == -1 && PyErr_Occurred())
14082 return NULL;
14083 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014084 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014085 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014086 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014087 Py_ssize_t start, stop, step, slicelength, i;
14088 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014089 PyObject *result;
14090 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014091 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014092 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014093
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014094 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014095 return NULL;
14096 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014097 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14098 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014099
14100 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014101 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014102 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014103 slicelength == PyUnicode_GET_LENGTH(self)) {
14104 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014105 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014106 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014107 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014108 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014109 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014110 src_kind = PyUnicode_KIND(self);
14111 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014112 if (!PyUnicode_IS_ASCII(self)) {
14113 kind_limit = kind_maxchar_limit(src_kind);
14114 max_char = 0;
14115 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14116 ch = PyUnicode_READ(src_kind, src_data, cur);
14117 if (ch > max_char) {
14118 max_char = ch;
14119 if (max_char >= kind_limit)
14120 break;
14121 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014122 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014123 }
Victor Stinner55c99112011-10-13 01:17:06 +020014124 else
14125 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014126 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014127 if (result == NULL)
14128 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014129 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014130 dest_data = PyUnicode_DATA(result);
14131
14132 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014133 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14134 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014135 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014136 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014137 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014138 } else {
14139 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14140 return NULL;
14141 }
14142}
14143
14144static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014145 (lenfunc)unicode_length, /* mp_length */
14146 (binaryfunc)unicode_subscript, /* mp_subscript */
14147 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014148};
14149
Guido van Rossumd57fd912000-03-10 22:53:23 +000014150
Guido van Rossumd57fd912000-03-10 22:53:23 +000014151/* Helpers for PyUnicode_Format() */
14152
Victor Stinnera47082312012-10-04 02:19:54 +020014153struct unicode_formatter_t {
14154 PyObject *args;
14155 int args_owned;
14156 Py_ssize_t arglen, argidx;
14157 PyObject *dict;
14158
14159 enum PyUnicode_Kind fmtkind;
14160 Py_ssize_t fmtcnt, fmtpos;
14161 void *fmtdata;
14162 PyObject *fmtstr;
14163
14164 _PyUnicodeWriter writer;
14165};
14166
14167struct unicode_format_arg_t {
14168 Py_UCS4 ch;
14169 int flags;
14170 Py_ssize_t width;
14171 int prec;
14172 int sign;
14173};
14174
Guido van Rossumd57fd912000-03-10 22:53:23 +000014175static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014176unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014177{
Victor Stinnera47082312012-10-04 02:19:54 +020014178 Py_ssize_t argidx = ctx->argidx;
14179
14180 if (argidx < ctx->arglen) {
14181 ctx->argidx++;
14182 if (ctx->arglen < 0)
14183 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014184 else
Victor Stinnera47082312012-10-04 02:19:54 +020014185 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014186 }
14187 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014188 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014189 return NULL;
14190}
14191
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014192/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014193
Victor Stinnera47082312012-10-04 02:19:54 +020014194/* Format a float into the writer if the writer is not NULL, or into *p_output
14195 otherwise.
14196
14197 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014198static int
Victor Stinnera47082312012-10-04 02:19:54 +020014199formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14200 PyObject **p_output,
14201 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014202{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014203 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014204 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014205 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014206 int prec;
14207 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014208
Guido van Rossumd57fd912000-03-10 22:53:23 +000014209 x = PyFloat_AsDouble(v);
14210 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014211 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014212
Victor Stinnera47082312012-10-04 02:19:54 +020014213 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014214 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014215 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014216
Victor Stinnera47082312012-10-04 02:19:54 +020014217 if (arg->flags & F_ALT)
14218 dtoa_flags = Py_DTSF_ALT;
14219 else
14220 dtoa_flags = 0;
14221 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014222 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014223 return -1;
14224 len = strlen(p);
14225 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014226 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014227 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014228 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014229 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014230 }
14231 else
14232 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014233 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014234 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014235}
14236
/* _PyUnicode_FormatLong() emulates the format codes d, i, u, o, x and X,
 * and the alternate form (F_ALT), for Python's long (unbounded) ints.  It's
 * not used for Python's regular ints.
 * Return value:  a new PyUnicodeObject*, or NULL if error.
 *     The output string is of the form
 *         "-"? ("0o" | "0x" | "0X")? digit+
 *     The base markers are present only for o, x and X conversions, with
 *     alt set to a nonzero value.  The case of hex digits will be correct.
 *     There will be at least prec digits, zero-filled on the left if
 *     necessary to get that many.
 * val          object to be converted
 * alt          nonzero to keep the base marker (the F_ALT format flag)
 * prec         minimum number of digits; 0-fill on left if needed
 * type         a character in [diuoxX]; i and u act the same as d
 *
 * CAUTION:  o, x and X conversions on regular ints can never
 * produce a '-' sign, but can for Python's unbounded ints.
 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014255PyObject *
14256_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014257{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014258 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014259 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014260 Py_ssize_t i;
14261 int sign; /* 1 if '-', else 0 */
14262 int len; /* number of characters */
14263 Py_ssize_t llen;
14264 int numdigits; /* len == numnondigits + numdigits */
14265 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014266
Victor Stinnerd0880d52012-04-27 23:40:13 +020014267 /* Avoid exceeding SSIZE_T_MAX */
14268 if (prec > INT_MAX-3) {
14269 PyErr_SetString(PyExc_OverflowError,
14270 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014271 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014272 }
14273
14274 assert(PyLong_Check(val));
14275
14276 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014278 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014279 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014280 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014281 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014282 /* int and int subclasses should print numerically when a numeric */
14283 /* format code is used (see issue18780) */
14284 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014285 break;
14286 case 'o':
14287 numnondigits = 2;
14288 result = PyNumber_ToBase(val, 8);
14289 break;
14290 case 'x':
14291 case 'X':
14292 numnondigits = 2;
14293 result = PyNumber_ToBase(val, 16);
14294 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014295 }
14296 if (!result)
14297 return NULL;
14298
14299 assert(unicode_modifiable(result));
14300 assert(PyUnicode_IS_READY(result));
14301 assert(PyUnicode_IS_ASCII(result));
14302
14303 /* To modify the string in-place, there can only be one reference. */
14304 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014305 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014306 PyErr_BadInternalCall();
14307 return NULL;
14308 }
14309 buf = PyUnicode_DATA(result);
14310 llen = PyUnicode_GET_LENGTH(result);
14311 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014312 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014313 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014314 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014315 return NULL;
14316 }
14317 len = (int)llen;
14318 sign = buf[0] == '-';
14319 numnondigits += sign;
14320 numdigits = len - numnondigits;
14321 assert(numdigits > 0);
14322
14323 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014324 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014325 (type == 'o' || type == 'x' || type == 'X'))) {
14326 assert(buf[sign] == '0');
14327 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14328 buf[sign+1] == 'o');
14329 numnondigits -= 2;
14330 buf += 2;
14331 len -= 2;
14332 if (sign)
14333 buf[0] = '-';
14334 assert(len == numnondigits + numdigits);
14335 assert(numdigits > 0);
14336 }
14337
14338 /* Fill with leading zeroes to meet minimum width. */
14339 if (prec > numdigits) {
14340 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14341 numnondigits + prec);
14342 char *b1;
14343 if (!r1) {
14344 Py_DECREF(result);
14345 return NULL;
14346 }
14347 b1 = PyBytes_AS_STRING(r1);
14348 for (i = 0; i < numnondigits; ++i)
14349 *b1++ = *buf++;
14350 for (i = 0; i < prec - numdigits; i++)
14351 *b1++ = '0';
14352 for (i = 0; i < numdigits; i++)
14353 *b1++ = *buf++;
14354 *b1 = '\0';
14355 Py_DECREF(result);
14356 result = r1;
14357 buf = PyBytes_AS_STRING(result);
14358 len = numnondigits + prec;
14359 }
14360
14361 /* Fix up case for hex conversions. */
14362 if (type == 'X') {
14363 /* Need to convert all lower case letters to upper case.
14364 and need to convert 0x to 0X (and -0x to -0X). */
14365 for (i = 0; i < len; i++)
14366 if (buf[i] >= 'a' && buf[i] <= 'x')
14367 buf[i] -= 'a'-'A';
14368 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014369 if (!PyUnicode_Check(result)
14370 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014371 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014372 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014373 Py_DECREF(result);
14374 result = unicode;
14375 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014376 else if (len != PyUnicode_GET_LENGTH(result)) {
14377 if (PyUnicode_Resize(&result, len) < 0)
14378 Py_CLEAR(result);
14379 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014380 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014381}
14382
Ethan Furmandf3ed242014-01-05 06:50:30 -080014383/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014384 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014385 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014386 * -1 and raise an exception on error */
14387static int
Victor Stinnera47082312012-10-04 02:19:54 +020014388mainformatlong(PyObject *v,
14389 struct unicode_format_arg_t *arg,
14390 PyObject **p_output,
14391 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014392{
14393 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014394 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014395
14396 if (!PyNumber_Check(v))
14397 goto wrongtype;
14398
Ethan Furman9ab74802014-03-21 06:38:46 -070014399 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014400 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014401 if (type == 'o' || type == 'x' || type == 'X') {
14402 iobj = PyNumber_Index(v);
14403 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014404 if (PyErr_ExceptionMatches(PyExc_TypeError))
14405 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014406 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014407 }
14408 }
14409 else {
14410 iobj = PyNumber_Long(v);
14411 if (iobj == NULL ) {
14412 if (PyErr_ExceptionMatches(PyExc_TypeError))
14413 goto wrongtype;
14414 return -1;
14415 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014416 }
14417 assert(PyLong_Check(iobj));
14418 }
14419 else {
14420 iobj = v;
14421 Py_INCREF(iobj);
14422 }
14423
14424 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014425 && arg->width == -1 && arg->prec == -1
14426 && !(arg->flags & (F_SIGN | F_BLANK))
14427 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014428 {
14429 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014430 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014431 int base;
14432
Victor Stinnera47082312012-10-04 02:19:54 +020014433 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014434 {
14435 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014436 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014437 case 'd':
14438 case 'i':
14439 case 'u':
14440 base = 10;
14441 break;
14442 case 'o':
14443 base = 8;
14444 break;
14445 case 'x':
14446 case 'X':
14447 base = 16;
14448 break;
14449 }
14450
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014451 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14452 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014453 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014454 }
14455 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014456 return 1;
14457 }
14458
Ethan Furmanb95b5612015-01-23 20:05:18 -080014459 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014460 Py_DECREF(iobj);
14461 if (res == NULL)
14462 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014463 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014464 return 0;
14465
14466wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014467 switch(type)
14468 {
14469 case 'o':
14470 case 'x':
14471 case 'X':
14472 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014473 "%%%c format: an integer is required, "
14474 "not %.200s",
14475 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014476 break;
14477 default:
14478 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014479 "%%%c format: a number is required, "
14480 "not %.200s",
14481 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014482 break;
14483 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014484 return -1;
14485}
14486
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014487static Py_UCS4
14488formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014489{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014490 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014491 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014492 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014493 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014494 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014495 goto onError;
14496 }
14497 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014498 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014499 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014500 /* make sure number is a type of integer */
14501 if (!PyLong_Check(v)) {
14502 iobj = PyNumber_Index(v);
14503 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014504 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014505 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014506 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014507 Py_DECREF(iobj);
14508 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014509 else {
14510 x = PyLong_AsLong(v);
14511 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014512 if (x == -1 && PyErr_Occurred())
14513 goto onError;
14514
Victor Stinner8faf8212011-12-08 22:14:11 +010014515 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014516 PyErr_SetString(PyExc_OverflowError,
14517 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014518 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014519 }
14520
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014521 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014522 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014523
Benjamin Peterson29060642009-01-31 22:14:21 +000014524 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014525 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014526 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014527 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014528}
14529
Victor Stinnera47082312012-10-04 02:19:54 +020014530/* Parse options of an argument: flags, width, precision.
14531 Handle also "%(name)" syntax.
14532
14533 Return 0 if the argument has been formatted into arg->str.
14534 Return 1 if the argument has been written into ctx->writer,
14535 Raise an exception and return -1 on error. */
14536static int
14537unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14538 struct unicode_format_arg_t *arg)
14539{
14540#define FORMAT_READ(ctx) \
14541 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14542
14543 PyObject *v;
14544
Victor Stinnera47082312012-10-04 02:19:54 +020014545 if (arg->ch == '(') {
14546 /* Get argument value from a dictionary. Example: "%(name)s". */
14547 Py_ssize_t keystart;
14548 Py_ssize_t keylen;
14549 PyObject *key;
14550 int pcount = 1;
14551
14552 if (ctx->dict == NULL) {
14553 PyErr_SetString(PyExc_TypeError,
14554 "format requires a mapping");
14555 return -1;
14556 }
14557 ++ctx->fmtpos;
14558 --ctx->fmtcnt;
14559 keystart = ctx->fmtpos;
14560 /* Skip over balanced parentheses */
14561 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14562 arg->ch = FORMAT_READ(ctx);
14563 if (arg->ch == ')')
14564 --pcount;
14565 else if (arg->ch == '(')
14566 ++pcount;
14567 ctx->fmtpos++;
14568 }
14569 keylen = ctx->fmtpos - keystart - 1;
14570 if (ctx->fmtcnt < 0 || pcount > 0) {
14571 PyErr_SetString(PyExc_ValueError,
14572 "incomplete format key");
14573 return -1;
14574 }
14575 key = PyUnicode_Substring(ctx->fmtstr,
14576 keystart, keystart + keylen);
14577 if (key == NULL)
14578 return -1;
14579 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014580 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014581 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014582 }
14583 ctx->args = PyObject_GetItem(ctx->dict, key);
14584 Py_DECREF(key);
14585 if (ctx->args == NULL)
14586 return -1;
14587 ctx->args_owned = 1;
14588 ctx->arglen = -1;
14589 ctx->argidx = -2;
14590 }
14591
14592 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014593 while (--ctx->fmtcnt >= 0) {
14594 arg->ch = FORMAT_READ(ctx);
14595 ctx->fmtpos++;
14596 switch (arg->ch) {
14597 case '-': arg->flags |= F_LJUST; continue;
14598 case '+': arg->flags |= F_SIGN; continue;
14599 case ' ': arg->flags |= F_BLANK; continue;
14600 case '#': arg->flags |= F_ALT; continue;
14601 case '0': arg->flags |= F_ZERO; continue;
14602 }
14603 break;
14604 }
14605
14606 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014607 if (arg->ch == '*') {
14608 v = unicode_format_getnextarg(ctx);
14609 if (v == NULL)
14610 return -1;
14611 if (!PyLong_Check(v)) {
14612 PyErr_SetString(PyExc_TypeError,
14613 "* wants int");
14614 return -1;
14615 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014616 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014617 if (arg->width == -1 && PyErr_Occurred())
14618 return -1;
14619 if (arg->width < 0) {
14620 arg->flags |= F_LJUST;
14621 arg->width = -arg->width;
14622 }
14623 if (--ctx->fmtcnt >= 0) {
14624 arg->ch = FORMAT_READ(ctx);
14625 ctx->fmtpos++;
14626 }
14627 }
14628 else if (arg->ch >= '0' && arg->ch <= '9') {
14629 arg->width = arg->ch - '0';
14630 while (--ctx->fmtcnt >= 0) {
14631 arg->ch = FORMAT_READ(ctx);
14632 ctx->fmtpos++;
14633 if (arg->ch < '0' || arg->ch > '9')
14634 break;
14635 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14636 mixing signed and unsigned comparison. Since arg->ch is between
14637 '0' and '9', casting to int is safe. */
14638 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14639 PyErr_SetString(PyExc_ValueError,
14640 "width too big");
14641 return -1;
14642 }
14643 arg->width = arg->width*10 + (arg->ch - '0');
14644 }
14645 }
14646
14647 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014648 if (arg->ch == '.') {
14649 arg->prec = 0;
14650 if (--ctx->fmtcnt >= 0) {
14651 arg->ch = FORMAT_READ(ctx);
14652 ctx->fmtpos++;
14653 }
14654 if (arg->ch == '*') {
14655 v = unicode_format_getnextarg(ctx);
14656 if (v == NULL)
14657 return -1;
14658 if (!PyLong_Check(v)) {
14659 PyErr_SetString(PyExc_TypeError,
14660 "* wants int");
14661 return -1;
14662 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014663 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014664 if (arg->prec == -1 && PyErr_Occurred())
14665 return -1;
14666 if (arg->prec < 0)
14667 arg->prec = 0;
14668 if (--ctx->fmtcnt >= 0) {
14669 arg->ch = FORMAT_READ(ctx);
14670 ctx->fmtpos++;
14671 }
14672 }
14673 else if (arg->ch >= '0' && arg->ch <= '9') {
14674 arg->prec = arg->ch - '0';
14675 while (--ctx->fmtcnt >= 0) {
14676 arg->ch = FORMAT_READ(ctx);
14677 ctx->fmtpos++;
14678 if (arg->ch < '0' || arg->ch > '9')
14679 break;
14680 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14681 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014682 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014683 return -1;
14684 }
14685 arg->prec = arg->prec*10 + (arg->ch - '0');
14686 }
14687 }
14688 }
14689
14690 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14691 if (ctx->fmtcnt >= 0) {
14692 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14693 if (--ctx->fmtcnt >= 0) {
14694 arg->ch = FORMAT_READ(ctx);
14695 ctx->fmtpos++;
14696 }
14697 }
14698 }
14699 if (ctx->fmtcnt < 0) {
14700 PyErr_SetString(PyExc_ValueError,
14701 "incomplete format");
14702 return -1;
14703 }
14704 return 0;
14705
14706#undef FORMAT_READ
14707}
14708
14709/* Format one argument. Supported conversion specifiers:
14710
14711 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014712 - "i", "d", "u": int or float
14713 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014714 - "e", "E", "f", "F", "g", "G": float
14715 - "c": int or str (1 character)
14716
Victor Stinner8dbd4212012-12-04 09:30:24 +010014717 When possible, the output is written directly into the Unicode writer
14718 (ctx->writer). A string is created when padding is required.
14719
Victor Stinnera47082312012-10-04 02:19:54 +020014720 Return 0 if the argument has been formatted into *p_str,
14721 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014722 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014723static int
14724unicode_format_arg_format(struct unicode_formatter_t *ctx,
14725 struct unicode_format_arg_t *arg,
14726 PyObject **p_str)
14727{
14728 PyObject *v;
14729 _PyUnicodeWriter *writer = &ctx->writer;
14730
14731 if (ctx->fmtcnt == 0)
14732 ctx->writer.overallocate = 0;
14733
Victor Stinnera47082312012-10-04 02:19:54 +020014734 v = unicode_format_getnextarg(ctx);
14735 if (v == NULL)
14736 return -1;
14737
Victor Stinnera47082312012-10-04 02:19:54 +020014738
14739 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014740 case 's':
14741 case 'r':
14742 case 'a':
14743 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14744 /* Fast path */
14745 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14746 return -1;
14747 return 1;
14748 }
14749
14750 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14751 *p_str = v;
14752 Py_INCREF(*p_str);
14753 }
14754 else {
14755 if (arg->ch == 's')
14756 *p_str = PyObject_Str(v);
14757 else if (arg->ch == 'r')
14758 *p_str = PyObject_Repr(v);
14759 else
14760 *p_str = PyObject_ASCII(v);
14761 }
14762 break;
14763
14764 case 'i':
14765 case 'd':
14766 case 'u':
14767 case 'o':
14768 case 'x':
14769 case 'X':
14770 {
14771 int ret = mainformatlong(v, arg, p_str, writer);
14772 if (ret != 0)
14773 return ret;
14774 arg->sign = 1;
14775 break;
14776 }
14777
14778 case 'e':
14779 case 'E':
14780 case 'f':
14781 case 'F':
14782 case 'g':
14783 case 'G':
14784 if (arg->width == -1 && arg->prec == -1
14785 && !(arg->flags & (F_SIGN | F_BLANK)))
14786 {
14787 /* Fast path */
14788 if (formatfloat(v, arg, NULL, writer) == -1)
14789 return -1;
14790 return 1;
14791 }
14792
14793 arg->sign = 1;
14794 if (formatfloat(v, arg, p_str, NULL) == -1)
14795 return -1;
14796 break;
14797
14798 case 'c':
14799 {
14800 Py_UCS4 ch = formatchar(v);
14801 if (ch == (Py_UCS4) -1)
14802 return -1;
14803 if (arg->width == -1 && arg->prec == -1) {
14804 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014805 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014806 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014807 return 1;
14808 }
14809 *p_str = PyUnicode_FromOrdinal(ch);
14810 break;
14811 }
14812
14813 default:
14814 PyErr_Format(PyExc_ValueError,
14815 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014816 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014817 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14818 (int)arg->ch,
14819 ctx->fmtpos - 1);
14820 return -1;
14821 }
14822 if (*p_str == NULL)
14823 return -1;
14824 assert (PyUnicode_Check(*p_str));
14825 return 0;
14826}
14827
14828static int
14829unicode_format_arg_output(struct unicode_formatter_t *ctx,
14830 struct unicode_format_arg_t *arg,
14831 PyObject *str)
14832{
14833 Py_ssize_t len;
14834 enum PyUnicode_Kind kind;
14835 void *pbuf;
14836 Py_ssize_t pindex;
14837 Py_UCS4 signchar;
14838 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014839 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014840 Py_ssize_t sublen;
14841 _PyUnicodeWriter *writer = &ctx->writer;
14842 Py_UCS4 fill;
14843
14844 fill = ' ';
14845 if (arg->sign && arg->flags & F_ZERO)
14846 fill = '0';
14847
14848 if (PyUnicode_READY(str) == -1)
14849 return -1;
14850
14851 len = PyUnicode_GET_LENGTH(str);
14852 if ((arg->width == -1 || arg->width <= len)
14853 && (arg->prec == -1 || arg->prec >= len)
14854 && !(arg->flags & (F_SIGN | F_BLANK)))
14855 {
14856 /* Fast path */
14857 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14858 return -1;
14859 return 0;
14860 }
14861
14862 /* Truncate the string for "s", "r" and "a" formats
14863 if the precision is set */
14864 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14865 if (arg->prec >= 0 && len > arg->prec)
14866 len = arg->prec;
14867 }
14868
14869 /* Adjust sign and width */
14870 kind = PyUnicode_KIND(str);
14871 pbuf = PyUnicode_DATA(str);
14872 pindex = 0;
14873 signchar = '\0';
14874 if (arg->sign) {
14875 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14876 if (ch == '-' || ch == '+') {
14877 signchar = ch;
14878 len--;
14879 pindex++;
14880 }
14881 else if (arg->flags & F_SIGN)
14882 signchar = '+';
14883 else if (arg->flags & F_BLANK)
14884 signchar = ' ';
14885 else
14886 arg->sign = 0;
14887 }
14888 if (arg->width < len)
14889 arg->width = len;
14890
14891 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014892 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014893 if (!(arg->flags & F_LJUST)) {
14894 if (arg->sign) {
14895 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014896 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014897 }
14898 else {
14899 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014900 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014901 }
14902 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014903 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14904 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014905 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014906 }
14907
Victor Stinnera47082312012-10-04 02:19:54 +020014908 buflen = arg->width;
14909 if (arg->sign && len == arg->width)
14910 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014911 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014912 return -1;
14913
14914 /* Write the sign if needed */
14915 if (arg->sign) {
14916 if (fill != ' ') {
14917 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14918 writer->pos += 1;
14919 }
14920 if (arg->width > len)
14921 arg->width--;
14922 }
14923
14924 /* Write the numeric prefix for "x", "X" and "o" formats
14925 if the alternate form is used.
14926 For example, write "0x" for the "%#x" format. */
14927 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14928 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14929 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14930 if (fill != ' ') {
14931 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14932 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14933 writer->pos += 2;
14934 pindex += 2;
14935 }
14936 arg->width -= 2;
14937 if (arg->width < 0)
14938 arg->width = 0;
14939 len -= 2;
14940 }
14941
14942 /* Pad left with the fill character if needed */
14943 if (arg->width > len && !(arg->flags & F_LJUST)) {
14944 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014945 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014946 writer->pos += sublen;
14947 arg->width = len;
14948 }
14949
14950 /* If padding with spaces: write sign if needed and/or numeric prefix if
14951 the alternate form is used */
14952 if (fill == ' ') {
14953 if (arg->sign) {
14954 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14955 writer->pos += 1;
14956 }
14957 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14958 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14959 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14960 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14961 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14962 writer->pos += 2;
14963 pindex += 2;
14964 }
14965 }
14966
14967 /* Write characters */
14968 if (len) {
14969 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14970 str, pindex, len);
14971 writer->pos += len;
14972 }
14973
14974 /* Pad right with the fill character if needed */
14975 if (arg->width > len) {
14976 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014977 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014978 writer->pos += sublen;
14979 }
14980 return 0;
14981}
14982
14983/* Helper of PyUnicode_Format(): format one arg.
14984 Return 0 on success, raise an exception and return -1 on error. */
14985static int
14986unicode_format_arg(struct unicode_formatter_t *ctx)
14987{
14988 struct unicode_format_arg_t arg;
14989 PyObject *str;
14990 int ret;
14991
Victor Stinner8dbd4212012-12-04 09:30:24 +010014992 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014993 if (arg.ch == '%') {
14994 ctx->fmtpos++;
14995 ctx->fmtcnt--;
14996 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14997 return -1;
14998 return 0;
14999 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015000 arg.flags = 0;
15001 arg.width = -1;
15002 arg.prec = -1;
15003 arg.sign = 0;
15004 str = NULL;
15005
Victor Stinnera47082312012-10-04 02:19:54 +020015006 ret = unicode_format_arg_parse(ctx, &arg);
15007 if (ret == -1)
15008 return -1;
15009
15010 ret = unicode_format_arg_format(ctx, &arg, &str);
15011 if (ret == -1)
15012 return -1;
15013
15014 if (ret != 1) {
15015 ret = unicode_format_arg_output(ctx, &arg, str);
15016 Py_DECREF(str);
15017 if (ret == -1)
15018 return -1;
15019 }
15020
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015021 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015022 PyErr_SetString(PyExc_TypeError,
15023 "not all arguments converted during string formatting");
15024 return -1;
15025 }
15026 return 0;
15027}
15028
Alexander Belopolsky40018472011-02-26 01:02:56 +000015029PyObject *
15030PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015031{
Victor Stinnera47082312012-10-04 02:19:54 +020015032 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015033
Guido van Rossumd57fd912000-03-10 22:53:23 +000015034 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015035 PyErr_BadInternalCall();
15036 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015037 }
Victor Stinnera47082312012-10-04 02:19:54 +020015038
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015039 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015040 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015041
15042 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015043 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15044 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15045 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15046 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015047
Victor Stinner8f674cc2013-04-17 23:02:17 +020015048 _PyUnicodeWriter_Init(&ctx.writer);
15049 ctx.writer.min_length = ctx.fmtcnt + 100;
15050 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015051
Guido van Rossumd57fd912000-03-10 22:53:23 +000015052 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015053 ctx.arglen = PyTuple_Size(args);
15054 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015055 }
15056 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015057 ctx.arglen = -1;
15058 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015059 }
Victor Stinnera47082312012-10-04 02:19:54 +020015060 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015061 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015062 ctx.dict = args;
15063 else
15064 ctx.dict = NULL;
15065 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015066
Victor Stinnera47082312012-10-04 02:19:54 +020015067 while (--ctx.fmtcnt >= 0) {
15068 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015069 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015070
15071 nonfmtpos = ctx.fmtpos++;
15072 while (ctx.fmtcnt >= 0 &&
15073 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15074 ctx.fmtpos++;
15075 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015076 }
Victor Stinnera47082312012-10-04 02:19:54 +020015077 if (ctx.fmtcnt < 0) {
15078 ctx.fmtpos--;
15079 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015080 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015081
Victor Stinnercfc4c132013-04-03 01:48:39 +020015082 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15083 nonfmtpos, ctx.fmtpos) < 0)
15084 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015085 }
15086 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015087 ctx.fmtpos++;
15088 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015089 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015090 }
15091 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015092
Victor Stinnera47082312012-10-04 02:19:54 +020015093 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015094 PyErr_SetString(PyExc_TypeError,
15095 "not all arguments converted during string formatting");
15096 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015097 }
15098
Victor Stinnera47082312012-10-04 02:19:54 +020015099 if (ctx.args_owned) {
15100 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015101 }
Victor Stinnera47082312012-10-04 02:19:54 +020015102 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015103
Benjamin Peterson29060642009-01-31 22:14:21 +000015104 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015105 _PyUnicodeWriter_Dealloc(&ctx.writer);
15106 if (ctx.args_owned) {
15107 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015108 }
15109 return NULL;
15110}
15111
Jeremy Hylton938ace62002-07-17 16:30:39 +000015112static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015113unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15114
Tim Peters6d6c1a32001-08-02 04:15:00 +000015115static PyObject *
15116unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15117{
Benjamin Peterson29060642009-01-31 22:14:21 +000015118 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 static char *kwlist[] = {"object", "encoding", "errors", 0};
15120 char *encoding = NULL;
15121 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015122
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 if (type != &PyUnicode_Type)
15124 return unicode_subtype_new(type, args, kwds);
15125 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015126 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 return NULL;
15128 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015129 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015130 if (encoding == NULL && errors == NULL)
15131 return PyObject_Str(x);
15132 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015133 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015134}
15135
Guido van Rossume023fe02001-08-30 03:12:59 +000015136static PyObject *
15137unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15138{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015139 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015140 Py_ssize_t length, char_size;
15141 int share_wstr, share_utf8;
15142 unsigned int kind;
15143 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015144
Benjamin Peterson14339b62009-01-31 16:36:08 +000015145 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015146
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015147 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015148 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015150 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015151 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015152 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015153 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015154 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015155
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015156 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015157 if (self == NULL) {
15158 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015159 return NULL;
15160 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015161 kind = PyUnicode_KIND(unicode);
15162 length = PyUnicode_GET_LENGTH(unicode);
15163
15164 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015165#ifdef Py_DEBUG
15166 _PyUnicode_HASH(self) = -1;
15167#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015168 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015169#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015170 _PyUnicode_STATE(self).interned = 0;
15171 _PyUnicode_STATE(self).kind = kind;
15172 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015173 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015174 _PyUnicode_STATE(self).ready = 1;
15175 _PyUnicode_WSTR(self) = NULL;
15176 _PyUnicode_UTF8_LENGTH(self) = 0;
15177 _PyUnicode_UTF8(self) = NULL;
15178 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015179 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015180
15181 share_utf8 = 0;
15182 share_wstr = 0;
15183 if (kind == PyUnicode_1BYTE_KIND) {
15184 char_size = 1;
15185 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15186 share_utf8 = 1;
15187 }
15188 else if (kind == PyUnicode_2BYTE_KIND) {
15189 char_size = 2;
15190 if (sizeof(wchar_t) == 2)
15191 share_wstr = 1;
15192 }
15193 else {
15194 assert(kind == PyUnicode_4BYTE_KIND);
15195 char_size = 4;
15196 if (sizeof(wchar_t) == 4)
15197 share_wstr = 1;
15198 }
15199
15200 /* Ensure we won't overflow the length. */
15201 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15202 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015203 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015204 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015205 data = PyObject_MALLOC((length + 1) * char_size);
15206 if (data == NULL) {
15207 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015208 goto onError;
15209 }
15210
Victor Stinnerc3c74152011-10-02 20:39:55 +020015211 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015212 if (share_utf8) {
15213 _PyUnicode_UTF8_LENGTH(self) = length;
15214 _PyUnicode_UTF8(self) = data;
15215 }
15216 if (share_wstr) {
15217 _PyUnicode_WSTR_LENGTH(self) = length;
15218 _PyUnicode_WSTR(self) = (wchar_t *)data;
15219 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015220
Christian Heimesf051e432016-09-13 20:22:02 +020015221 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015222 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015223 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015224#ifdef Py_DEBUG
15225 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15226#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015227 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015228 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015229
15230onError:
15231 Py_DECREF(unicode);
15232 Py_DECREF(self);
15233 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015234}
15235
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015236PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015237"str(object='') -> str\n\
15238str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015239\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015240Create a new string object from the given object. If encoding or\n\
15241errors is specified, then the object must expose a data buffer\n\
15242that will be decoded using the given encoding and error handler.\n\
15243Otherwise, returns the result of object.__str__() (if defined)\n\
15244or repr(object).\n\
15245encoding defaults to sys.getdefaultencoding().\n\
15246errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015247
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015248static PyObject *unicode_iter(PyObject *seq);
15249
Guido van Rossumd57fd912000-03-10 22:53:23 +000015250PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015251 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015252 "str", /* tp_name */
15253 sizeof(PyUnicodeObject), /* tp_basicsize */
15254 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015255 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015256 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015257 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015258 0, /* tp_getattr */
15259 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015260 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015261 unicode_repr, /* tp_repr */
15262 &unicode_as_number, /* tp_as_number */
15263 &unicode_as_sequence, /* tp_as_sequence */
15264 &unicode_as_mapping, /* tp_as_mapping */
15265 (hashfunc) unicode_hash, /* tp_hash*/
15266 0, /* tp_call*/
15267 (reprfunc) unicode_str, /* tp_str */
15268 PyObject_GenericGetAttr, /* tp_getattro */
15269 0, /* tp_setattro */
15270 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015271 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015272 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15273 unicode_doc, /* tp_doc */
15274 0, /* tp_traverse */
15275 0, /* tp_clear */
15276 PyUnicode_RichCompare, /* tp_richcompare */
15277 0, /* tp_weaklistoffset */
15278 unicode_iter, /* tp_iter */
15279 0, /* tp_iternext */
15280 unicode_methods, /* tp_methods */
15281 0, /* tp_members */
15282 0, /* tp_getset */
15283 &PyBaseObject_Type, /* tp_base */
15284 0, /* tp_dict */
15285 0, /* tp_descr_get */
15286 0, /* tp_descr_set */
15287 0, /* tp_dictoffset */
15288 0, /* tp_init */
15289 0, /* tp_alloc */
15290 unicode_new, /* tp_new */
15291 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015292};
15293
15294/* Initialize the Unicode implementation */
15295
Victor Stinner331a6a52019-05-27 16:39:22 +020015296PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015297_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015298{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015299 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015300 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015301 0x000A, /* LINE FEED */
15302 0x000D, /* CARRIAGE RETURN */
15303 0x001C, /* FILE SEPARATOR */
15304 0x001D, /* GROUP SEPARATOR */
15305 0x001E, /* RECORD SEPARATOR */
15306 0x0085, /* NEXT LINE */
15307 0x2028, /* LINE SEPARATOR */
15308 0x2029, /* PARAGRAPH SEPARATOR */
15309 };
15310
Fred Drakee4315f52000-05-09 19:53:39 +000015311 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015312 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015313 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015314 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015315 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015316 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015317
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015318 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015319 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015320 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015321
15322 /* initialize the linebreak bloom filter */
15323 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015324 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015325 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015326
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015327 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015328 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015329 }
15330 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015331 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015332 }
15333 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015334 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015335 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015336 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015337}
15338
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015339
Walter Dörwald16807132007-05-25 13:52:07 +000015340void
15341PyUnicode_InternInPlace(PyObject **p)
15342{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015343 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015344 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015345#ifdef Py_DEBUG
15346 assert(s != NULL);
15347 assert(_PyUnicode_CHECK(s));
15348#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015349 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015350 return;
15351#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015352 /* If it's a subclass, we don't really know what putting
15353 it in the interned dict might do. */
15354 if (!PyUnicode_CheckExact(s))
15355 return;
15356 if (PyUnicode_CHECK_INTERNED(s))
15357 return;
15358 if (interned == NULL) {
15359 interned = PyDict_New();
15360 if (interned == NULL) {
15361 PyErr_Clear(); /* Don't leave an exception */
15362 return;
15363 }
15364 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015365 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015366 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015367 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015368 if (t == NULL) {
15369 PyErr_Clear();
15370 return;
15371 }
15372 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015373 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015374 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015375 return;
15376 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 /* The two references in interned are not counted by refcnt.
15378 The deallocator will take care of this */
15379 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015380 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015381}
15382
15383void
15384PyUnicode_InternImmortal(PyObject **p)
15385{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015386 PyUnicode_InternInPlace(p);
15387 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015388 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015389 Py_INCREF(*p);
15390 }
Walter Dörwald16807132007-05-25 13:52:07 +000015391}
15392
15393PyObject *
15394PyUnicode_InternFromString(const char *cp)
15395{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015396 PyObject *s = PyUnicode_FromString(cp);
15397 if (s == NULL)
15398 return NULL;
15399 PyUnicode_InternInPlace(&s);
15400 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015401}
15402
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015403
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every key of the interned dict, then clear and drop
   the dict.  Only compiled when WITH_VALGRIND or __INSURE__ is defined. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        int state = PyUnicode_CHECK_INTERNED(s);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_REFCNT(s) += 1;
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_REFCNT(s) += 2;
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict must be interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015464
15465
15466/********************* Unicode Iterator **************************/
15467
15468typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015469 PyObject_HEAD
15470 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015471 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015472} unicodeiterobject;
15473
15474static void
15475unicodeiter_dealloc(unicodeiterobject *it)
15476{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015477 _PyObject_GC_UNTRACK(it);
15478 Py_XDECREF(it->it_seq);
15479 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015480}
15481
15482static int
15483unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15484{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015485 Py_VISIT(it->it_seq);
15486 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015487}
15488
15489static PyObject *
15490unicodeiter_next(unicodeiterobject *it)
15491{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015492 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015493
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 assert(it != NULL);
15495 seq = it->it_seq;
15496 if (seq == NULL)
15497 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015498 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015500 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15501 int kind = PyUnicode_KIND(seq);
15502 void *data = PyUnicode_DATA(seq);
15503 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15504 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015505 if (item != NULL)
15506 ++it->it_index;
15507 return item;
15508 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015509
Benjamin Peterson14339b62009-01-31 16:36:08 +000015510 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015511 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015512 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015513}
15514
15515static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015516unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015517{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015518 Py_ssize_t len = 0;
15519 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015520 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015521 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015522}
15523
15524PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15525
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015526static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015527unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015528{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015529 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015530 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015531 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015532 it->it_seq, it->it_index);
15533 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015534 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015535 if (u == NULL)
15536 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015537 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015538 }
15539}
15540
15541PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15542
15543static PyObject *
15544unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15545{
15546 Py_ssize_t index = PyLong_AsSsize_t(state);
15547 if (index == -1 && PyErr_Occurred())
15548 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015549 if (it->it_seq != NULL) {
15550 if (index < 0)
15551 index = 0;
15552 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15553 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15554 it->it_index = index;
15555 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015556 Py_RETURN_NONE;
15557}
15558
15559PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15560
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015561static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015562 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015563 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015564 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15565 reduce_doc},
15566 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15567 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015568 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015569};
15570
15571PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15573 "str_iterator", /* tp_name */
15574 sizeof(unicodeiterobject), /* tp_basicsize */
15575 0, /* tp_itemsize */
15576 /* methods */
15577 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015578 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015579 0, /* tp_getattr */
15580 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015581 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015582 0, /* tp_repr */
15583 0, /* tp_as_number */
15584 0, /* tp_as_sequence */
15585 0, /* tp_as_mapping */
15586 0, /* tp_hash */
15587 0, /* tp_call */
15588 0, /* tp_str */
15589 PyObject_GenericGetAttr, /* tp_getattro */
15590 0, /* tp_setattro */
15591 0, /* tp_as_buffer */
15592 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15593 0, /* tp_doc */
15594 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15595 0, /* tp_clear */
15596 0, /* tp_richcompare */
15597 0, /* tp_weaklistoffset */
15598 PyObject_SelfIter, /* tp_iter */
15599 (iternextfunc)unicodeiter_next, /* tp_iternext */
15600 unicodeiter_methods, /* tp_methods */
15601 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015602};
15603
15604static PyObject *
15605unicode_iter(PyObject *seq)
15606{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015607 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015608
Benjamin Peterson14339b62009-01-31 16:36:08 +000015609 if (!PyUnicode_Check(seq)) {
15610 PyErr_BadInternalCall();
15611 return NULL;
15612 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015613 if (PyUnicode_READY(seq) == -1)
15614 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015615 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15616 if (it == NULL)
15617 return NULL;
15618 it->it_index = 0;
15619 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015620 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015621 _PyObject_GC_TRACK(it);
15622 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015623}
15624
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015625
15626size_t
15627Py_UNICODE_strlen(const Py_UNICODE *u)
15628{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015629 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015630}
15631
15632Py_UNICODE*
15633Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15634{
15635 Py_UNICODE *u = s1;
15636 while ((*u++ = *s2++));
15637 return s1;
15638}
15639
15640Py_UNICODE*
15641Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15642{
15643 Py_UNICODE *u = s1;
15644 while ((*u++ = *s2++))
15645 if (n-- == 0)
15646 break;
15647 return s1;
15648}
15649
15650Py_UNICODE*
15651Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15652{
15653 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015654 u1 += wcslen(u1);
15655 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015656 return s1;
15657}
15658
15659int
15660Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15661{
15662 while (*s1 && *s2 && *s1 == *s2)
15663 s1++, s2++;
15664 if (*s1 && *s2)
15665 return (*s1 < *s2) ? -1 : +1;
15666 if (*s1)
15667 return 1;
15668 if (*s2)
15669 return -1;
15670 return 0;
15671}
15672
15673int
15674Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15675{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015676 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015677 for (; n != 0; n--) {
15678 u1 = *s1;
15679 u2 = *s2;
15680 if (u1 != u2)
15681 return (u1 < u2) ? -1 : +1;
15682 if (u1 == '\0')
15683 return 0;
15684 s1++;
15685 s2++;
15686 }
15687 return 0;
15688}
15689
15690Py_UNICODE*
15691Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15692{
15693 const Py_UNICODE *p;
15694 for (p = s; *p; p++)
15695 if (*p == c)
15696 return (Py_UNICODE*)p;
15697 return NULL;
15698}
15699
15700Py_UNICODE*
15701Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15702{
15703 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015704 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015705 while (p != s) {
15706 p--;
15707 if (*p == c)
15708 return (Py_UNICODE*)p;
15709 }
15710 return NULL;
15711}
Victor Stinner331ea922010-08-10 16:37:20 +000015712
Victor Stinner71133ff2010-09-01 23:43:53 +000015713Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015714PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015715{
Victor Stinner577db2c2011-10-11 22:12:48 +020015716 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015717 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015718
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015719 if (!PyUnicode_Check(unicode)) {
15720 PyErr_BadArgument();
15721 return NULL;
15722 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015723 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015724 if (u == NULL)
15725 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015726 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015727 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015728 PyErr_NoMemory();
15729 return NULL;
15730 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015731 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015732 size *= sizeof(Py_UNICODE);
15733 copy = PyMem_Malloc(size);
15734 if (copy == NULL) {
15735 PyErr_NoMemory();
15736 return NULL;
15737 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015738 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015739 return copy;
15740}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015741
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015742
Victor Stinner709d23d2019-05-02 14:56:30 -040015743static int
15744encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015745{
Victor Stinner709d23d2019-05-02 14:56:30 -040015746 int res;
15747 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15748 if (res == -2) {
15749 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15750 return -1;
15751 }
15752 if (res < 0) {
15753 PyErr_NoMemory();
15754 return -1;
15755 }
15756 return 0;
15757}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015758
Victor Stinner709d23d2019-05-02 14:56:30 -040015759
15760static int
15761config_get_codec_name(wchar_t **config_encoding)
15762{
15763 char *encoding;
15764 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15765 return -1;
15766 }
15767
15768 PyObject *name_obj = NULL;
15769 PyObject *codec = _PyCodec_Lookup(encoding);
15770 PyMem_RawFree(encoding);
15771
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015772 if (!codec)
15773 goto error;
15774
15775 name_obj = PyObject_GetAttrString(codec, "name");
15776 Py_CLEAR(codec);
15777 if (!name_obj) {
15778 goto error;
15779 }
15780
Victor Stinner709d23d2019-05-02 14:56:30 -040015781 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15782 Py_DECREF(name_obj);
15783 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015784 goto error;
15785 }
15786
Victor Stinner709d23d2019-05-02 14:56:30 -040015787 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15788 if (raw_wname == NULL) {
15789 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015790 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015791 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015792 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015793
15794 PyMem_RawFree(*config_encoding);
15795 *config_encoding = raw_wname;
15796
15797 PyMem_Free(wname);
15798 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015799
15800error:
15801 Py_XDECREF(codec);
15802 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015804}
15805
15806
Victor Stinner331a6a52019-05-27 16:39:22 +020015807static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015808init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015809{
Victor Stinner709d23d2019-05-02 14:56:30 -040015810 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015811 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015812 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015813 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015814 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015815 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015816 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015817}
15818
15819
Victor Stinner709d23d2019-05-02 14:56:30 -040015820static int
15821init_fs_codec(PyInterpreterState *interp)
15822{
Victor Stinner331a6a52019-05-27 16:39:22 +020015823 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015824
15825 _Py_error_handler error_handler;
15826 error_handler = get_error_handler_wide(config->filesystem_errors);
15827 if (error_handler == _Py_ERROR_UNKNOWN) {
15828 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15829 return -1;
15830 }
15831
15832 char *encoding, *errors;
15833 if (encode_wstr_utf8(config->filesystem_encoding,
15834 &encoding,
15835 "filesystem_encoding") < 0) {
15836 return -1;
15837 }
15838
15839 if (encode_wstr_utf8(config->filesystem_errors,
15840 &errors,
15841 "filesystem_errors") < 0) {
15842 PyMem_RawFree(encoding);
15843 return -1;
15844 }
15845
15846 PyMem_RawFree(interp->fs_codec.encoding);
15847 interp->fs_codec.encoding = encoding;
15848 PyMem_RawFree(interp->fs_codec.errors);
15849 interp->fs_codec.errors = errors;
15850 interp->fs_codec.error_handler = error_handler;
15851
15852 /* At this point, PyUnicode_EncodeFSDefault() and
15853 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15854 the C implementation of the filesystem encoding. */
15855
15856 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15857 global configuration variables. */
15858 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15859 interp->fs_codec.errors) < 0) {
15860 PyErr_NoMemory();
15861 return -1;
15862 }
15863 return 0;
15864}
15865
15866
Victor Stinner331a6a52019-05-27 16:39:22 +020015867static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015868init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015869{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015870 PyInterpreterState *interp = tstate->interp;
15871
Victor Stinner709d23d2019-05-02 14:56:30 -040015872 /* Update the filesystem encoding to the normalized Python codec name.
15873 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15874 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015875 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015876 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015877 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015878 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015879 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015880 }
15881
Victor Stinner709d23d2019-05-02 14:56:30 -040015882 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015883 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015884 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015885 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015886}
15887
15888
Victor Stinner331a6a52019-05-27 16:39:22 +020015889PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015890_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015891{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015892 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015893 if (_PyStatus_EXCEPTION(status)) {
15894 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015895 }
15896
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015897 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015898}
15899
15900
Victor Stinner709d23d2019-05-02 14:56:30 -040015901#ifdef MS_WINDOWS
15902int
15903_PyUnicode_EnableLegacyWindowsFSEncoding(void)
15904{
15905 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinner331a6a52019-05-27 16:39:22 +020015906 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015907
15908 /* Set the filesystem encoding to mbcs/replace (PEP 529) */
15909 wchar_t *encoding = _PyMem_RawWcsdup(L"mbcs");
15910 wchar_t *errors = _PyMem_RawWcsdup(L"replace");
15911 if (encoding == NULL || errors == NULL) {
15912 PyMem_RawFree(encoding);
15913 PyMem_RawFree(errors);
15914 PyErr_NoMemory();
15915 return -1;
15916 }
15917
15918 PyMem_RawFree(config->filesystem_encoding);
15919 config->filesystem_encoding = encoding;
15920 PyMem_RawFree(config->filesystem_errors);
15921 config->filesystem_errors = errors;
15922
15923 return init_fs_codec(interp);
15924}
15925#endif
15926
15927
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015928void
Victor Stinner3d483342019-11-22 12:27:50 +010015929_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015930{
Victor Stinner3d483342019-11-22 12:27:50 +010015931 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015932#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010015933 /* Insure++ is a memory analysis tool that aids in discovering
15934 * memory leaks and other memory problems. On Python exit, the
15935 * interned string dictionaries are flagged as being in use at exit
15936 * (which it is). Under normal circumstances, this is fine because
15937 * the memory will be automatically reclaimed by the system. Under
15938 * memory debugging, it's a huge source of useless noise, so we
15939 * trade off slower shutdown for less distraction in the memory
15940 * reports. -baw
15941 */
15942 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015943#endif /* __INSURE__ */
15944
Victor Stinner3d483342019-11-22 12:27:50 +010015945 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015946
Victor Stinner3d483342019-11-22 12:27:50 +010015947 for (Py_ssize_t i = 0; i < 256; i++) {
15948 Py_CLEAR(unicode_latin1[i]);
15949 }
15950 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015951 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015952
15953 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15954 PyMem_RawFree(interp->fs_codec.encoding);
15955 interp->fs_codec.encoding = NULL;
15956 PyMem_RawFree(interp->fs_codec.errors);
15957 interp->fs_codec.errors = NULL;
Pablo Galindo016b0282019-12-02 18:09:43 +000015958 interp->config.filesystem_errors = (wchar_t *)_Py_ERROR_UNKNOWN;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015959}
15960
15961
Georg Brandl66c221e2010-10-14 07:04:07 +000015962/* A _string module, to export formatter_parser and formatter_field_name_split
15963 to the string.Formatter class implemented in Python. */
15964
15965static PyMethodDef _string_methods[] = {
15966 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15967 METH_O, PyDoc_STR("split the argument as a field name")},
15968 {"formatter_parser", (PyCFunction) formatter_parser,
15969 METH_O, PyDoc_STR("parse the argument as a format string")},
15970 {NULL, NULL}
15971};
15972
15973static struct PyModuleDef _string_module = {
15974 PyModuleDef_HEAD_INIT,
15975 "_string",
15976 PyDoc_STR("string helper module"),
15977 0,
15978 _string_methods,
15979 NULL,
15980 NULL,
15981 NULL,
15982 NULL
15983};
15984
15985PyMODINIT_FUNC
15986PyInit__string(void)
15987{
15988 return PyModule_Create(&_string_module);
15989}
15990
15991
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015992#ifdef __cplusplus
15993}
15994#endif