blob: 5f10437a1524cc06c2b5743b8887afaefa254808 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010043#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020044#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040047#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010048#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000049#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050050#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070051#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000052
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000053#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000054#include <windows.h>
55#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000056
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
/* #define INTERNED_STATS 1 */
60
61
/*[clinic input]
class str "PyObject *" "&PyUnicode_Type"
[clinic start generated code]*/
/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/

/*[python input]
class Py_UCS4_converter(CConverter):
    type = 'Py_UCS4'
    converter = 'convert_uc'

    def converter_init(self):
        if self.default is not unspecified:
            self.c_default = ascii(self.default)
            if len(self.c_default) > 4 or self.c_default[0] != "'":
                self.c_default = hex(ord(self.default))

[python start generated code]*/
/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080080
/* --- Globals ------------------------------------------------------------

NOTE: In the interpreter's initialization phase, some globals are currently
      initialized dynamically as needed. In the process Unicode objects may
      be created before the Unicode type is ready.

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000088
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000089
#ifdef __cplusplus
extern "C" {
#endif

/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used inside the accessor macros of this file.
   With Py_DEBUG defined it runs _PyUnicode_CheckConsistency() with
   check_content=0; otherwise it is only the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200102
/* Field accessors for the string structs (PyASCIIObject,
   PyCompactUnicodeObject, PyUnicodeObject).  Macros that expand to a bare
   member access perform no checking; the others assert _PyUnicode_CHECK(op)
   before touching the object.  All of them evaluate `op` more than once or
   cast it, so pass a side-effect free expression. */

/* Unchecked access to the utf8 member of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject struct, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length member. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the `length` member for a compact ASCII
   string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr, wstr_length, length, state and hash
   members. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind and length, guarded by the _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any member of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200137
/* Drop any earlier definition of PyUnicode_READY and define a file-local
   one: after asserting _PyUnicode_CHECK(op), it evaluates to 0 when the
   string is already ready and otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                        \
    (assert(_PyUnicode_CHECK(op)),                  \
     (PyUnicode_IS_READY(op) ?                      \
      0 :                                           \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200144
/* True if the utf8 pointer of `op` is the same address as its character
   data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro only refers to its own argument; it previously expanded
   `_PyUnicode_WSTR(unicode)` and so silently depended on a caller variable
   named `unicode`. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready or the pointer differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
166
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The main loop
   copies four characters per iteration up to the largest multiple of four;
   the second loop handles the remaining 0-3 characters.  begin, end and to
   are each evaluated exactly once. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200190
/* Over-allocation factor.  The percentages quoted below correspond to
   1/OVERALLOCATE_FACTOR (2 -> 50%, 4 -> 25%). */
#ifdef MS_WINDOWS
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#endif
198
Walter Dörwald16807132007-05-25 13:52:07 +0000199/* This dictionary holds all interned unicode strings. Note that references
200 to strings in this dictionary are *not* counted in the string's ob_refcnt.
201 When the interned string reaches a refcnt of 0 the string deallocation
202 function will delete the reference from this dictionary.
203
204 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000205 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000206*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200207static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000208
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000209/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200210static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200211
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) if it does not exist yet.  If that creation fails,
   unicode_empty stays NULL and no reference is taken, so callers must be
   prepared for unicode_empty == NULL afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function; the returned value is NULL if the empty string could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000230
Victor Stinner59423e32018-11-26 13:40:01 +0100231static inline void
232unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
233 Py_ssize_t start, Py_ssize_t length)
234{
235 assert(0 <= start);
236 assert(kind != PyUnicode_WCHAR_KIND);
237 switch (kind) {
238 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100239 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100240 Py_UCS1 ch = (unsigned char)value;
241 Py_UCS1 *to = (Py_UCS1 *)data + start;
242 memset(to, ch, length);
243 break;
244 }
245 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100246 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100247 Py_UCS2 ch = (Py_UCS2)value;
248 Py_UCS2 *to = (Py_UCS2 *)data + start;
249 const Py_UCS2 *end = to + length;
250 for (; to < end; ++to) *to = ch;
251 break;
252 }
253 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100254 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100255 Py_UCS4 ch = value;
256 Py_UCS4 * to = (Py_UCS4 *)data + start;
257 const Py_UCS4 *end = to + length;
258 for (; to < end; ++to) *to = ch;
259 break;
260 }
261 default: Py_UNREACHABLE();
262 }
263}
264
265
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200266/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700267static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200268_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900269static inline void
270_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400271static PyObject *
272unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
273 const char *errors);
274static PyObject *
275unicode_decode_utf8(const char *s, Py_ssize_t size,
276 _Py_error_handler error_handler, const char *errors,
277 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200278
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200279/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200280static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200281
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282/* Single character Unicode strings in the Latin-1 range are being
283 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200284static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000285
/* Fast detection of the most frequent whitespace characters.
   128 entries indexed by ASCII code: 1 for whitespace, 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
316
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200317/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200318static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200319static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100320static int unicode_modifiable(PyObject *unicode);
321
Victor Stinnerfe226c02011-10-03 03:52:20 +0200322
Alexander Belopolsky40018472011-02-26 01:02:56 +0000323static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100324_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200325static PyObject *
326_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
327static PyObject *
328_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
329
330static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000331unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000332 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100333 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000334 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
335
Alexander Belopolsky40018472011-02-26 01:02:56 +0000336static void
337raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300338 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100339 PyObject *unicode,
340 Py_ssize_t startpos, Py_ssize_t endpos,
341 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000342
/* Same for linebreaks: 128 entries indexed by ASCII code, 1 for a line
   break character, 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
370
INADA Naoki3ae20562017-01-16 20:41:20 +0900371static int convert_uc(PyObject *obj, void *addr);
372
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300373#include "clinic/unicodeobject.c.h"
374
Victor Stinner3d4226a2018-08-29 22:21:32 +0200375_Py_error_handler
376_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200377{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200378 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200385 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200394 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
396 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200397 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200398 }
Victor Stinner50149202015-09-22 00:26:54 +0200399 return _Py_ERROR_OTHER;
400}
401
Victor Stinner709d23d2019-05-02 14:56:30 -0400402
403static _Py_error_handler
404get_error_handler_wide(const wchar_t *errors)
405{
406 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
407 return _Py_ERROR_STRICT;
408 }
409 if (wcscmp(errors, L"surrogateescape") == 0) {
410 return _Py_ERROR_SURROGATEESCAPE;
411 }
412 if (wcscmp(errors, L"replace") == 0) {
413 return _Py_ERROR_REPLACE;
414 }
415 if (wcscmp(errors, L"ignore") == 0) {
416 return _Py_ERROR_IGNORE;
417 }
418 if (wcscmp(errors, L"backslashreplace") == 0) {
419 return _Py_ERROR_BACKSLASHREPLACE;
420 }
421 if (wcscmp(errors, L"surrogatepass") == 0) {
422 return _Py_ERROR_SURROGATEPASS;
423 }
424 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
425 return _Py_ERROR_XMLCHARREFREPLACE;
426 }
427 return _Py_ERROR_OTHER;
428}
429
430
Victor Stinner22eb6892019-06-26 00:51:05 +0200431static inline int
432unicode_check_encoding_errors(const char *encoding, const char *errors)
433{
434 if (encoding == NULL && errors == NULL) {
435 return 0;
436 }
437
438 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
439#ifndef Py_DEBUG
440 /* In release mode, only check in development mode (-X dev) */
441 if (!interp->config.dev_mode) {
442 return 0;
443 }
444#else
445 /* Always check in debug mode */
446#endif
447
448 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
449 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
450 if (!interp->fs_codec.encoding) {
451 return 0;
452 }
453
454 if (encoding != NULL) {
455 PyObject *handler = _PyCodec_Lookup(encoding);
456 if (handler == NULL) {
457 return -1;
458 }
459 Py_DECREF(handler);
460 }
461
462 if (errors != NULL) {
463 PyObject *handler = PyCodec_LookupError(errors);
464 if (handler == NULL) {
465 return -1;
466 }
467 Py_DECREF(handler);
468 }
469 return 0;
470}
471
472
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300473/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
474 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000475Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000476PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000477{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000478#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000479 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000480#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000481 /* This is actually an illegal character, so it should
482 not be passed to unichr. */
483 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484#endif
485}
486
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200487int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100488_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200489{
Victor Stinner68762572019-10-07 18:42:01 +0200490#define CHECK(expr) \
491 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
492
Victor Stinner910337b2011-10-03 03:20:16 +0200493 PyASCIIObject *ascii;
494 unsigned int kind;
495
Victor Stinner68762572019-10-07 18:42:01 +0200496 assert(op != NULL);
497 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200498
499 ascii = (PyASCIIObject *)op;
500 kind = ascii->state.kind;
501
Victor Stinnera3b334d2011-10-03 13:53:37 +0200502 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200503 CHECK(kind == PyUnicode_1BYTE_KIND);
504 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200505 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200506 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200507 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200508 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200509
Victor Stinnera41463c2011-10-04 01:05:08 +0200510 if (ascii->state.compact == 1) {
511 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200512 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200513 || kind == PyUnicode_2BYTE_KIND
514 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200515 CHECK(ascii->state.ascii == 0);
516 CHECK(ascii->state.ready == 1);
517 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100518 }
519 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200520 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
521
522 data = unicode->data.any;
523 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200524 CHECK(ascii->length == 0);
525 CHECK(ascii->hash == -1);
526 CHECK(ascii->state.compact == 0);
527 CHECK(ascii->state.ascii == 0);
528 CHECK(ascii->state.ready == 0);
529 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
530 CHECK(ascii->wstr != NULL);
531 CHECK(data == NULL);
532 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200533 }
534 else {
Victor Stinner68762572019-10-07 18:42:01 +0200535 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200536 || kind == PyUnicode_2BYTE_KIND
537 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200538 CHECK(ascii->state.compact == 0);
539 CHECK(ascii->state.ready == 1);
540 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200541 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(compact->utf8 == data);
543 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200544 }
545 else
Victor Stinner68762572019-10-07 18:42:01 +0200546 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200547 }
548 }
549 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200550 if (
551#if SIZEOF_WCHAR_T == 2
552 kind == PyUnicode_2BYTE_KIND
553#else
554 kind == PyUnicode_4BYTE_KIND
555#endif
556 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200557 {
Victor Stinner68762572019-10-07 18:42:01 +0200558 CHECK(ascii->wstr == data);
559 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200560 } else
Victor Stinner68762572019-10-07 18:42:01 +0200561 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200562 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200563
564 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200566 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200567 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200568 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200569
570 /* check that the best kind is used: O(n) operation */
571 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200572 Py_ssize_t i;
573 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200574 void *data;
575 Py_UCS4 ch;
576
577 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200578 for (i=0; i < ascii->length; i++)
579 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200580 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200581 if (ch > maxchar)
582 maxchar = ch;
583 }
584 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100585 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200586 CHECK(maxchar >= 128);
587 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100588 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200589 else
Victor Stinner68762572019-10-07 18:42:01 +0200590 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200591 }
Victor Stinner77faf692011-11-20 18:56:05 +0100592 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 0x100);
594 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
596 else {
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar >= 0x10000);
598 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100599 }
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200601 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400602 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200603
604#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400605}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200606
Victor Stinner910337b2011-10-03 03:20:16 +0200607
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608static PyObject*
609unicode_result_wchar(PyObject *unicode)
610{
611#ifndef Py_DEBUG
612 Py_ssize_t len;
613
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100614 len = _PyUnicode_WSTR_LENGTH(unicode);
615 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200617 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100618 }
619
620 if (len == 1) {
621 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100622 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
624 Py_DECREF(unicode);
625 return latin1_char;
626 }
627 }
628
629 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200630 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100631 return NULL;
632 }
633#else
Victor Stinneraa771272012-10-04 02:32:58 +0200634 assert(Py_REFCNT(unicode) == 1);
635
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100636 /* don't make the result ready in debug mode to ensure that the caller
637 makes the string ready before using it */
638 assert(_PyUnicode_CheckConsistency(unicode, 1));
639#endif
640 return unicode;
641}
642
643static PyObject*
644unicode_result_ready(PyObject *unicode)
645{
646 Py_ssize_t length;
647
648 length = PyUnicode_GET_LENGTH(unicode);
649 if (length == 0) {
650 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100651 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200652 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100653 }
654 return unicode_empty;
655 }
656
657 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200658 void *data = PyUnicode_DATA(unicode);
659 int kind = PyUnicode_KIND(unicode);
660 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100661 if (ch < 256) {
662 PyObject *latin1_char = unicode_latin1[ch];
663 if (latin1_char != NULL) {
664 if (unicode != latin1_char) {
665 Py_INCREF(latin1_char);
666 Py_DECREF(unicode);
667 }
668 return latin1_char;
669 }
670 else {
671 assert(_PyUnicode_CheckConsistency(unicode, 1));
672 Py_INCREF(unicode);
673 unicode_latin1[ch] = unicode;
674 return unicode;
675 }
676 }
677 }
678
679 assert(_PyUnicode_CheckConsistency(unicode, 1));
680 return unicode;
681}
682
683static PyObject*
684unicode_result(PyObject *unicode)
685{
686 assert(_PyUnicode_CHECK(unicode));
687 if (PyUnicode_IS_READY(unicode))
688 return unicode_result_ready(unicode);
689 else
690 return unicode_result_wchar(unicode);
691}
692
Victor Stinnerc4b49542011-12-11 22:44:26 +0100693static PyObject*
694unicode_result_unchanged(PyObject *unicode)
695{
696 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500697 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100698 return NULL;
699 Py_INCREF(unicode);
700 return unicode;
701 }
702 else
703 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100704 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705}
706
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200707/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
708 ASCII, Latin1, UTF-8, etc. */
709static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200710backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200711 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
712{
Victor Stinnerad771582015-10-09 12:38:53 +0200713 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714 Py_UCS4 ch;
715 enum PyUnicode_Kind kind;
716 void *data;
717
718 assert(PyUnicode_IS_READY(unicode));
719 kind = PyUnicode_KIND(unicode);
720 data = PyUnicode_DATA(unicode);
721
722 size = 0;
723 /* determine replacement size */
724 for (i = collstart; i < collend; ++i) {
725 Py_ssize_t incr;
726
727 ch = PyUnicode_READ(kind, data, i);
728 if (ch < 0x100)
729 incr = 2+2;
730 else if (ch < 0x10000)
731 incr = 2+4;
732 else {
733 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200734 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200735 }
736 if (size > PY_SSIZE_T_MAX - incr) {
737 PyErr_SetString(PyExc_OverflowError,
738 "encoded result is too long for a Python string");
739 return NULL;
740 }
741 size += incr;
742 }
743
Victor Stinnerad771582015-10-09 12:38:53 +0200744 str = _PyBytesWriter_Prepare(writer, str, size);
745 if (str == NULL)
746 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200747
748 /* generate replacement */
749 for (i = collstart; i < collend; ++i) {
750 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200751 *str++ = '\\';
752 if (ch >= 0x00010000) {
753 *str++ = 'U';
754 *str++ = Py_hexdigits[(ch>>28)&0xf];
755 *str++ = Py_hexdigits[(ch>>24)&0xf];
756 *str++ = Py_hexdigits[(ch>>20)&0xf];
757 *str++ = Py_hexdigits[(ch>>16)&0xf];
758 *str++ = Py_hexdigits[(ch>>12)&0xf];
759 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200760 }
Victor Stinner797485e2015-10-09 03:17:30 +0200761 else if (ch >= 0x100) {
762 *str++ = 'u';
763 *str++ = Py_hexdigits[(ch>>12)&0xf];
764 *str++ = Py_hexdigits[(ch>>8)&0xf];
765 }
766 else
767 *str++ = 'x';
768 *str++ = Py_hexdigits[(ch>>4)&0xf];
769 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200770 }
771 return str;
772}
773
774/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
775 ASCII, Latin1, UTF-8, etc. */
776static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200777xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200778 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
779{
Victor Stinnerad771582015-10-09 12:38:53 +0200780 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200781 Py_UCS4 ch;
782 enum PyUnicode_Kind kind;
783 void *data;
784
785 assert(PyUnicode_IS_READY(unicode));
786 kind = PyUnicode_KIND(unicode);
787 data = PyUnicode_DATA(unicode);
788
789 size = 0;
790 /* determine replacement size */
791 for (i = collstart; i < collend; ++i) {
792 Py_ssize_t incr;
793
794 ch = PyUnicode_READ(kind, data, i);
795 if (ch < 10)
796 incr = 2+1+1;
797 else if (ch < 100)
798 incr = 2+2+1;
799 else if (ch < 1000)
800 incr = 2+3+1;
801 else if (ch < 10000)
802 incr = 2+4+1;
803 else if (ch < 100000)
804 incr = 2+5+1;
805 else if (ch < 1000000)
806 incr = 2+6+1;
807 else {
808 assert(ch <= MAX_UNICODE);
809 incr = 2+7+1;
810 }
811 if (size > PY_SSIZE_T_MAX - incr) {
812 PyErr_SetString(PyExc_OverflowError,
813 "encoded result is too long for a Python string");
814 return NULL;
815 }
816 size += incr;
817 }
818
Victor Stinnerad771582015-10-09 12:38:53 +0200819 str = _PyBytesWriter_Prepare(writer, str, size);
820 if (str == NULL)
821 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200822
823 /* generate replacement */
824 for (i = collstart; i < collend; ++i) {
825 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
826 }
827 return str;
828}
829
Thomas Wouters477c8d52006-05-27 19:21:47 +0000830/* --- Bloom Filters ----------------------------------------------------- */
831
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
835
836/* the linebreak mask is set up by Unicode_Init below */
837
Antoine Pitrouf068f942010-01-13 14:19:12 +0000838#if LONG_BIT >= 128
839#define BLOOM_WIDTH 128
840#elif LONG_BIT >= 64
841#define BLOOM_WIDTH 64
842#elif LONG_BIT >= 32
843#define BLOOM_WIDTH 32
844#else
845#error "LONG_BIT is smaller than 32"
846#endif
847
Thomas Wouters477c8d52006-05-27 19:21:47 +0000848#define BLOOM_MASK unsigned long
849
Serhiy Storchaka05997252013-01-26 12:14:02 +0200850static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000851
Antoine Pitrouf068f942010-01-13 14:19:12 +0000852#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000853
Benjamin Peterson29060642009-01-31 22:14:21 +0000854#define BLOOM_LINEBREAK(ch) \
855 ((ch) < 128U ? ascii_linebreak[(ch)] : \
856 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000857
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700858static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200859make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860{
Victor Stinnera85af502013-04-09 21:53:54 +0200861#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
862 do { \
863 TYPE *data = (TYPE *)PTR; \
864 TYPE *end = data + LEN; \
865 Py_UCS4 ch; \
866 for (; data != end; data++) { \
867 ch = *data; \
868 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
869 } \
870 break; \
871 } while (0)
872
Thomas Wouters477c8d52006-05-27 19:21:47 +0000873 /* calculate simple bloom-style bitmask for a given unicode string */
874
Antoine Pitrouf068f942010-01-13 14:19:12 +0000875 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000876
877 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200878 switch (kind) {
879 case PyUnicode_1BYTE_KIND:
880 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
881 break;
882 case PyUnicode_2BYTE_KIND:
883 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
884 break;
885 case PyUnicode_4BYTE_KIND:
886 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
887 break;
888 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700889 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200890 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000891 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200892
893#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000894}
895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300896static int
897ensure_unicode(PyObject *obj)
898{
899 if (!PyUnicode_Check(obj)) {
900 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200901 "must be str, not %.100s",
902 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903 return -1;
904 }
905 return PyUnicode_READY(obj);
906}
907
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200908/* Compilation of templated routines */
909
910#include "stringlib/asciilib.h"
911#include "stringlib/fastsearch.h"
912#include "stringlib/partition.h"
913#include "stringlib/split.h"
914#include "stringlib/count.h"
915#include "stringlib/find.h"
916#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200917#include "stringlib/undef.h"
918
919#include "stringlib/ucs1lib.h"
920#include "stringlib/fastsearch.h"
921#include "stringlib/partition.h"
922#include "stringlib/split.h"
923#include "stringlib/count.h"
924#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300925#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200926#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200927#include "stringlib/undef.h"
928
929#include "stringlib/ucs2lib.h"
930#include "stringlib/fastsearch.h"
931#include "stringlib/partition.h"
932#include "stringlib/split.h"
933#include "stringlib/count.h"
934#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300935#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200936#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200937#include "stringlib/undef.h"
938
939#include "stringlib/ucs4lib.h"
940#include "stringlib/fastsearch.h"
941#include "stringlib/partition.h"
942#include "stringlib/split.h"
943#include "stringlib/count.h"
944#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300945#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200946#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200947#include "stringlib/undef.h"
948
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200949#include "stringlib/unicodedefs.h"
950#include "stringlib/fastsearch.h"
951#include "stringlib/count.h"
952#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100953#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200954
Guido van Rossumd57fd912000-03-10 22:53:23 +0000955/* --- Unicode Object ----------------------------------------------------- */
956
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700957static inline Py_ssize_t
958findchar(const void *s, int kind,
959 Py_ssize_t size, Py_UCS4 ch,
960 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200961{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200962 switch (kind) {
963 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200964 if ((Py_UCS1) ch != ch)
965 return -1;
966 if (direction > 0)
967 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
968 else
969 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200970 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS2) ch != ch)
972 return -1;
973 if (direction > 0)
974 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
975 else
976 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if (direction > 0)
979 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
980 else
981 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200982 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700983 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200984 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200985}
986
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, current length)
   with 0xff bytes so that use of uninitialized data is detected earlier.
   Does nothing if the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the character width in bytes. */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
1005
Victor Stinnerfe226c02011-10-03 03:52:20 +02001006static PyObject*
1007resize_compact(PyObject *unicode, Py_ssize_t length)
1008{
1009 Py_ssize_t char_size;
1010 Py_ssize_t struct_size;
1011 Py_ssize_t new_size;
1012 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001013 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001014#ifdef Py_DEBUG
1015 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1016#endif
1017
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001020 assert(PyUnicode_IS_COMPACT(unicode));
1021
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001022 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001023 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001024 struct_size = sizeof(PyASCIIObject);
1025 else
1026 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001027 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028
Victor Stinnerfe226c02011-10-03 03:52:20 +02001029 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1030 PyErr_NoMemory();
1031 return NULL;
1032 }
1033 new_size = (struct_size + (length + 1) * char_size);
1034
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001035 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1036 PyObject_DEL(_PyUnicode_UTF8(unicode));
1037 _PyUnicode_UTF8(unicode) = NULL;
1038 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1039 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001040#ifdef Py_REF_DEBUG
1041 _Py_RefTotal--;
1042#endif
1043#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001044 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001045#endif
Victor Stinner84def372011-12-11 20:04:56 +01001046
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001047 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001048 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001049 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001050 PyErr_NoMemory();
1051 return NULL;
1052 }
Victor Stinner84def372011-12-11 20:04:56 +01001053 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001054 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001057 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001058 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001059 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001060 _PyUnicode_WSTR_LENGTH(unicode) = length;
1061 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001062 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1063 PyObject_DEL(_PyUnicode_WSTR(unicode));
1064 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001065 if (!PyUnicode_IS_ASCII(unicode))
1066 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001067 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001068#ifdef Py_DEBUG
1069 unicode_fill_invalid(unicode, old_length);
1070#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001071 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1072 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001073 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001074 return unicode;
1075}
1076
Alexander Belopolsky40018472011-02-26 01:02:56 +00001077static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001078resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001079{
Victor Stinner95663112011-10-04 01:03:50 +02001080 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001081 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001082 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001084
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 if (PyUnicode_IS_READY(unicode)) {
1086 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001087 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001088 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001089#ifdef Py_DEBUG
1090 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1091#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092
1093 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001094 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001095 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1096 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001097
1098 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1099 PyErr_NoMemory();
1100 return -1;
1101 }
1102 new_size = (length + 1) * char_size;
1103
Victor Stinner7a9105a2011-12-12 00:13:42 +01001104 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1105 {
1106 PyObject_DEL(_PyUnicode_UTF8(unicode));
1107 _PyUnicode_UTF8(unicode) = NULL;
1108 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1109 }
1110
Victor Stinnerfe226c02011-10-03 03:52:20 +02001111 data = (PyObject *)PyObject_REALLOC(data, new_size);
1112 if (data == NULL) {
1113 PyErr_NoMemory();
1114 return -1;
1115 }
1116 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001117 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001119 _PyUnicode_WSTR_LENGTH(unicode) = length;
1120 }
1121 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001122 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001123 _PyUnicode_UTF8_LENGTH(unicode) = length;
1124 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_LENGTH(unicode) = length;
1126 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001127#ifdef Py_DEBUG
1128 unicode_fill_invalid(unicode, old_length);
1129#endif
Victor Stinner95663112011-10-04 01:03:50 +02001130 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001131 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001133 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001134 }
Victor Stinner95663112011-10-04 01:03:50 +02001135 assert(_PyUnicode_WSTR(unicode) != NULL);
1136
1137 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001138 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001139 PyErr_NoMemory();
1140 return -1;
1141 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001142 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001143 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001144 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001145 if (!wstr) {
1146 PyErr_NoMemory();
1147 return -1;
1148 }
1149 _PyUnicode_WSTR(unicode) = wstr;
1150 _PyUnicode_WSTR(unicode)[length] = 0;
1151 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001152 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001153 return 0;
1154}
1155
Victor Stinnerfe226c02011-10-03 03:52:20 +02001156static PyObject*
1157resize_copy(PyObject *unicode, Py_ssize_t length)
1158{
1159 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001160 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001161 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001162
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001163 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001164
1165 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1166 if (copy == NULL)
1167 return NULL;
1168
1169 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001170 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001172 }
1173 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001174 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001175
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001176 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001177 if (w == NULL)
1178 return NULL;
1179 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1180 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001181 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001182 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 }
1185}
1186
/* We allocate one more character (not just one more byte) to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
1195
Alexander Belopolsky40018472011-02-26 01:02:56 +00001196static PyUnicodeObject *
1197_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001199 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001201
Thomas Wouters477c8d52006-05-27 19:21:47 +00001202 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001203 if (length == 0 && unicode_empty != NULL) {
1204 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001205 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001206 }
1207
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001208 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001209 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001210 return (PyUnicodeObject *)PyErr_NoMemory();
1211 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001212 if (length < 0) {
1213 PyErr_SetString(PyExc_SystemError,
1214 "Negative size passed to _PyUnicode_New");
1215 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001216 }
1217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001218 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1219 if (unicode == NULL)
1220 return NULL;
1221 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001222
1223 _PyUnicode_WSTR_LENGTH(unicode) = length;
1224 _PyUnicode_HASH(unicode) = -1;
1225 _PyUnicode_STATE(unicode).interned = 0;
1226 _PyUnicode_STATE(unicode).kind = 0;
1227 _PyUnicode_STATE(unicode).compact = 0;
1228 _PyUnicode_STATE(unicode).ready = 0;
1229 _PyUnicode_STATE(unicode).ascii = 0;
1230 _PyUnicode_DATA_ANY(unicode) = NULL;
1231 _PyUnicode_LENGTH(unicode) = 0;
1232 _PyUnicode_UTF8(unicode) = NULL;
1233 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1234
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001235 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1236 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001237 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001238 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001239 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001240 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241
Jeremy Hyltond8082792003-09-16 19:41:39 +00001242 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001243 * the caller fails before initializing str -- unicode_resize()
1244 * reads str[0], and the Keep-Alive optimization can keep memory
1245 * allocated for str alive across a call to unicode_dealloc(unicode).
1246 * We don't want unicode_resize to read uninitialized memory in
1247 * that case.
1248 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001249 _PyUnicode_WSTR(unicode)[0] = 0;
1250 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001251
Victor Stinner7931d9a2011-11-04 00:22:48 +01001252 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001253 return unicode;
1254}
1255
Victor Stinnerf42dc442011-10-02 23:33:16 +02001256static const char*
1257unicode_kind_name(PyObject *unicode)
1258{
Victor Stinner42dfd712011-10-03 14:41:45 +02001259 /* don't check consistency: unicode_kind_name() is called from
1260 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001261 if (!PyUnicode_IS_COMPACT(unicode))
1262 {
1263 if (!PyUnicode_IS_READY(unicode))
1264 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001265 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001266 {
1267 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001268 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001269 return "legacy ascii";
1270 else
1271 return "legacy latin1";
1272 case PyUnicode_2BYTE_KIND:
1273 return "legacy UCS2";
1274 case PyUnicode_4BYTE_KIND:
1275 return "legacy UCS4";
1276 default:
1277 return "<legacy invalid kind>";
1278 }
1279 }
1280 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001281 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001282 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001283 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001284 return "ascii";
1285 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001286 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001287 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001288 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 default:
1292 return "<invalid compact kind>";
1293 }
1294}
1295
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001297/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001298char *_PyUnicode_utf8(void *unicode_raw){
1299 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001300 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001301}
1302
Victor Stinnera42de742018-11-22 10:25:22 +01001303void *_PyUnicode_compact_data(void *unicode_raw) {
1304 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001305 return _PyUnicode_COMPACT_DATA(unicode);
1306}
Victor Stinnera42de742018-11-22 10:25:22 +01001307void *_PyUnicode_data(void *unicode_raw) {
1308 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001309 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001310 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1311 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1312 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1313 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1314 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1315 return PyUnicode_DATA(unicode);
1316}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001317
1318void
1319_PyUnicode_Dump(PyObject *op)
1320{
1321 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001322 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1323 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1324 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001325
Victor Stinnera849a4b2011-10-03 12:12:11 +02001326 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001327 {
1328 if (ascii->state.ascii)
1329 data = (ascii + 1);
1330 else
1331 data = (compact + 1);
1332 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 else
1334 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001335 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1336 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001337
Victor Stinnera849a4b2011-10-03 12:12:11 +02001338 if (ascii->wstr == data)
1339 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001340 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001341
Victor Stinnera3b334d2011-10-03 13:53:37 +02001342 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001343 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001344 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1345 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001346 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001348 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001349 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001350}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001351#endif
1352
1353PyObject *
1354PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1355{
1356 PyObject *obj;
1357 PyCompactUnicodeObject *unicode;
1358 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001359 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001360 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001361 Py_ssize_t char_size;
1362 Py_ssize_t struct_size;
1363
1364 /* Optimization for empty strings */
1365 if (size == 0 && unicode_empty != NULL) {
1366 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001367 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 }
1369
Victor Stinner9e9d6892011-10-04 01:02:02 +02001370 is_ascii = 0;
1371 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001372 struct_size = sizeof(PyCompactUnicodeObject);
1373 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001374 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 char_size = 1;
1376 is_ascii = 1;
1377 struct_size = sizeof(PyASCIIObject);
1378 }
1379 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001380 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 char_size = 1;
1382 }
1383 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001384 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001385 char_size = 2;
1386 if (sizeof(wchar_t) == 2)
1387 is_sharing = 1;
1388 }
1389 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001390 if (maxchar > MAX_UNICODE) {
1391 PyErr_SetString(PyExc_SystemError,
1392 "invalid maximum character passed to PyUnicode_New");
1393 return NULL;
1394 }
Victor Stinner8f825062012-04-27 13:55:39 +02001395 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001396 char_size = 4;
1397 if (sizeof(wchar_t) == 4)
1398 is_sharing = 1;
1399 }
1400
1401 /* Ensure we won't overflow the size. */
1402 if (size < 0) {
1403 PyErr_SetString(PyExc_SystemError,
1404 "Negative size passed to PyUnicode_New");
1405 return NULL;
1406 }
1407 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1408 return PyErr_NoMemory();
1409
1410 /* Duplicated allocation code from _PyObject_New() instead of a call to
1411 * PyObject_New() so we are able to allocate space for the object and
1412 * it's data buffer.
1413 */
1414 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1415 if (obj == NULL)
1416 return PyErr_NoMemory();
1417 obj = PyObject_INIT(obj, &PyUnicode_Type);
1418 if (obj == NULL)
1419 return NULL;
1420
1421 unicode = (PyCompactUnicodeObject *)obj;
1422 if (is_ascii)
1423 data = ((PyASCIIObject*)obj) + 1;
1424 else
1425 data = unicode + 1;
1426 _PyUnicode_LENGTH(unicode) = size;
1427 _PyUnicode_HASH(unicode) = -1;
1428 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001429 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001430 _PyUnicode_STATE(unicode).compact = 1;
1431 _PyUnicode_STATE(unicode).ready = 1;
1432 _PyUnicode_STATE(unicode).ascii = is_ascii;
1433 if (is_ascii) {
1434 ((char*)data)[size] = 0;
1435 _PyUnicode_WSTR(unicode) = NULL;
1436 }
Victor Stinner8f825062012-04-27 13:55:39 +02001437 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001438 ((char*)data)[size] = 0;
1439 _PyUnicode_WSTR(unicode) = NULL;
1440 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001441 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001442 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001443 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001444 else {
1445 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001446 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001447 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001449 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001450 ((Py_UCS4*)data)[size] = 0;
1451 if (is_sharing) {
1452 _PyUnicode_WSTR_LENGTH(unicode) = size;
1453 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1454 }
1455 else {
1456 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1457 _PyUnicode_WSTR(unicode) = NULL;
1458 }
1459 }
Victor Stinner8f825062012-04-27 13:55:39 +02001460#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001461 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001462#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001463 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001464 return obj;
1465}
1466
1467#if SIZEOF_WCHAR_T == 2
1468/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1469 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001470 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471
1472 This function assumes that unicode can hold one more code point than wstr
1473 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001474static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001475unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001476 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477{
1478 const wchar_t *iter;
1479 Py_UCS4 *ucs4_out;
1480
Victor Stinner910337b2011-10-03 03:20:16 +02001481 assert(unicode != NULL);
1482 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1484 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1485
1486 for (iter = begin; iter < end; ) {
1487 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1488 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001489 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1490 && (iter+1) < end
1491 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001492 {
Victor Stinner551ac952011-11-29 22:58:13 +01001493 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001494 iter += 2;
1495 }
1496 else {
1497 *ucs4_out++ = *iter;
1498 iter++;
1499 }
1500 }
1501 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1502 _PyUnicode_GET_LENGTH(unicode)));
1503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001504}
1505#endif
1506
Victor Stinnercd9950f2011-10-02 00:34:53 +02001507static int
Victor Stinner488fa492011-12-12 00:01:39 +01001508unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001509{
Victor Stinner488fa492011-12-12 00:01:39 +01001510 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001511 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001512 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001513 return -1;
1514 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001515 return 0;
1516}
1517
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001518static int
1519_copy_characters(PyObject *to, Py_ssize_t to_start,
1520 PyObject *from, Py_ssize_t from_start,
1521 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001522{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001523 unsigned int from_kind, to_kind;
1524 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525
Victor Stinneree4544c2012-05-09 22:24:08 +02001526 assert(0 <= how_many);
1527 assert(0 <= from_start);
1528 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001529 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001530 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001531 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinnerd3f08822012-05-29 12:57:52 +02001533 assert(PyUnicode_Check(to));
1534 assert(PyUnicode_IS_READY(to));
1535 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1536
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001537 if (how_many == 0)
1538 return 0;
1539
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001540 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001541 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001542 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001543 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001544
Victor Stinnerf1852262012-06-16 16:38:26 +02001545#ifdef Py_DEBUG
1546 if (!check_maxchar
1547 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1548 {
1549 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1550 Py_UCS4 ch;
1551 Py_ssize_t i;
1552 for (i=0; i < how_many; i++) {
1553 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1554 assert(ch <= to_maxchar);
1555 }
1556 }
1557#endif
1558
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001560 if (check_maxchar
1561 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1562 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001563 /* Writing Latin-1 characters into an ASCII string requires to
1564 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001565 Py_UCS4 max_char;
1566 max_char = ucs1lib_find_max_char(from_data,
1567 (Py_UCS1*)from_data + how_many);
1568 if (max_char >= 128)
1569 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 }
Christian Heimesf051e432016-09-13 20:22:02 +02001571 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001572 (char*)from_data + from_kind * from_start,
1573 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001574 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001575 else if (from_kind == PyUnicode_1BYTE_KIND
1576 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001577 {
1578 _PyUnicode_CONVERT_BYTES(
1579 Py_UCS1, Py_UCS2,
1580 PyUnicode_1BYTE_DATA(from) + from_start,
1581 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1582 PyUnicode_2BYTE_DATA(to) + to_start
1583 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001584 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001585 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001586 && to_kind == PyUnicode_4BYTE_KIND)
1587 {
1588 _PyUnicode_CONVERT_BYTES(
1589 Py_UCS1, Py_UCS4,
1590 PyUnicode_1BYTE_DATA(from) + from_start,
1591 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1592 PyUnicode_4BYTE_DATA(to) + to_start
1593 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001594 }
1595 else if (from_kind == PyUnicode_2BYTE_KIND
1596 && to_kind == PyUnicode_4BYTE_KIND)
1597 {
1598 _PyUnicode_CONVERT_BYTES(
1599 Py_UCS2, Py_UCS4,
1600 PyUnicode_2BYTE_DATA(from) + from_start,
1601 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1602 PyUnicode_4BYTE_DATA(to) + to_start
1603 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001604 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001605 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001606 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1607
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001608 if (!check_maxchar) {
1609 if (from_kind == PyUnicode_2BYTE_KIND
1610 && to_kind == PyUnicode_1BYTE_KIND)
1611 {
1612 _PyUnicode_CONVERT_BYTES(
1613 Py_UCS2, Py_UCS1,
1614 PyUnicode_2BYTE_DATA(from) + from_start,
1615 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1616 PyUnicode_1BYTE_DATA(to) + to_start
1617 );
1618 }
1619 else if (from_kind == PyUnicode_4BYTE_KIND
1620 && to_kind == PyUnicode_1BYTE_KIND)
1621 {
1622 _PyUnicode_CONVERT_BYTES(
1623 Py_UCS4, Py_UCS1,
1624 PyUnicode_4BYTE_DATA(from) + from_start,
1625 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1626 PyUnicode_1BYTE_DATA(to) + to_start
1627 );
1628 }
1629 else if (from_kind == PyUnicode_4BYTE_KIND
1630 && to_kind == PyUnicode_2BYTE_KIND)
1631 {
1632 _PyUnicode_CONVERT_BYTES(
1633 Py_UCS4, Py_UCS2,
1634 PyUnicode_4BYTE_DATA(from) + from_start,
1635 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1636 PyUnicode_2BYTE_DATA(to) + to_start
1637 );
1638 }
1639 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001640 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001641 }
1642 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001643 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001644 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001645 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001646 Py_ssize_t i;
1647
Victor Stinnera0702ab2011-09-29 14:14:38 +02001648 for (i=0; i < how_many; i++) {
1649 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001650 if (ch > to_maxchar)
1651 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001652 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1653 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001654 }
1655 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001656 return 0;
1657}
1658
Victor Stinnerd3f08822012-05-29 12:57:52 +02001659void
1660_PyUnicode_FastCopyCharacters(
1661 PyObject *to, Py_ssize_t to_start,
1662 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001663{
1664 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1665}
1666
1667Py_ssize_t
1668PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1669 PyObject *from, Py_ssize_t from_start,
1670 Py_ssize_t how_many)
1671{
1672 int err;
1673
1674 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1675 PyErr_BadInternalCall();
1676 return -1;
1677 }
1678
Benjamin Petersonbac79492012-01-14 13:34:47 -05001679 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001680 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001681 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001682 return -1;
1683
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001684 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001685 PyErr_SetString(PyExc_IndexError, "string index out of range");
1686 return -1;
1687 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001688 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001689 PyErr_SetString(PyExc_IndexError, "string index out of range");
1690 return -1;
1691 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001692 if (how_many < 0) {
1693 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1694 return -1;
1695 }
1696 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001697 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1698 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001699 "Cannot write %zi characters at %zi "
1700 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001701 how_many, to_start, PyUnicode_GET_LENGTH(to));
1702 return -1;
1703 }
1704
1705 if (how_many == 0)
1706 return 0;
1707
Victor Stinner488fa492011-12-12 00:01:39 +01001708 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001709 return -1;
1710
1711 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1712 if (err) {
1713 PyErr_Format(PyExc_SystemError,
1714 "Cannot copy %s characters "
1715 "into a string of %s characters",
1716 unicode_kind_name(from),
1717 unicode_kind_name(to));
1718 return -1;
1719 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001720 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721}
1722
Victor Stinner17222162011-09-28 22:15:37 +02001723/* Find the maximum code point and count the number of surrogate pairs so a
1724 correct string length can be computed before converting a string to UCS4.
1725 This function counts single surrogates as a character and not as a pair.
1726
1727 Return 0 on success, or -1 on error. */
1728static int
1729find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1730 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001731{
1732 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001733 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Victor Stinnerc53be962011-10-02 21:33:54 +02001735 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001736 *num_surrogates = 0;
1737 *maxchar = 0;
1738
1739 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001741 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1742 && (iter+1) < end
1743 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1744 {
1745 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1746 ++(*num_surrogates);
1747 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001748 }
1749 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001750#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001751 {
1752 ch = *iter;
1753 iter++;
1754 }
1755 if (ch > *maxchar) {
1756 *maxchar = ch;
1757 if (*maxchar > MAX_UNICODE) {
1758 PyErr_Format(PyExc_ValueError,
1759 "character U+%x is not in range [U+0000; U+10ffff]",
1760 ch);
1761 return -1;
1762 }
1763 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001764 }
1765 return 0;
1766}
1767
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001768int
1769_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001770{
1771 wchar_t *end;
1772 Py_UCS4 maxchar = 0;
1773 Py_ssize_t num_surrogates;
1774#if SIZEOF_WCHAR_T == 2
1775 Py_ssize_t length_wo_surrogates;
1776#endif
1777
Georg Brandl7597add2011-10-05 16:36:47 +02001778 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001779 strings were created using _PyObject_New() and where no canonical
1780 representation (the str field) has been set yet aka strings
1781 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001782 assert(_PyUnicode_CHECK(unicode));
1783 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001784 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001785 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001786 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001787 /* Actually, it should neither be interned nor be anything else: */
1788 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001789
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001790 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001791 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001792 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001794
1795 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001796 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1797 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001798 PyErr_NoMemory();
1799 return -1;
1800 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001801 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001802 _PyUnicode_WSTR(unicode), end,
1803 PyUnicode_1BYTE_DATA(unicode));
1804 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1805 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1806 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1807 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001808 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001809 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001810 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001811 }
1812 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001813 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001814 _PyUnicode_UTF8(unicode) = NULL;
1815 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001816 }
1817 PyObject_FREE(_PyUnicode_WSTR(unicode));
1818 _PyUnicode_WSTR(unicode) = NULL;
1819 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1820 }
1821 /* In this case we might have to convert down from 4-byte native
1822 wchar_t to 2-byte unicode. */
1823 else if (maxchar < 65536) {
1824 assert(num_surrogates == 0 &&
1825 "FindMaxCharAndNumSurrogatePairs() messed up");
1826
Victor Stinner506f5922011-09-28 22:34:18 +02001827#if SIZEOF_WCHAR_T == 2
1828 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001830 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1831 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1832 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001833 _PyUnicode_UTF8(unicode) = NULL;
1834 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001835#else
1836 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001837 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001838 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001839 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001840 PyErr_NoMemory();
1841 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001842 }
Victor Stinner506f5922011-09-28 22:34:18 +02001843 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1844 _PyUnicode_WSTR(unicode), end,
1845 PyUnicode_2BYTE_DATA(unicode));
1846 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1847 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1848 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001849 _PyUnicode_UTF8(unicode) = NULL;
1850 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001851 PyObject_FREE(_PyUnicode_WSTR(unicode));
1852 _PyUnicode_WSTR(unicode) = NULL;
1853 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1854#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001855 }
1856 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1857 else {
1858#if SIZEOF_WCHAR_T == 2
1859 /* in case the native representation is 2-bytes, we need to allocate a
1860 new normalized 4-byte version. */
1861 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001862 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1863 PyErr_NoMemory();
1864 return -1;
1865 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001866 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1867 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001868 PyErr_NoMemory();
1869 return -1;
1870 }
1871 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1872 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001873 _PyUnicode_UTF8(unicode) = NULL;
1874 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001875 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1876 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001877 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001878 PyObject_FREE(_PyUnicode_WSTR(unicode));
1879 _PyUnicode_WSTR(unicode) = NULL;
1880 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1881#else
1882 assert(num_surrogates == 0);
1883
Victor Stinnerc3c74152011-10-02 20:39:55 +02001884 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001886 _PyUnicode_UTF8(unicode) = NULL;
1887 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001888 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1889#endif
1890 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1891 }
1892 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001893 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001894 return 0;
1895}
1896
Alexander Belopolsky40018472011-02-26 01:02:56 +00001897static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001898unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001899{
Walter Dörwald16807132007-05-25 13:52:07 +00001900 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001901 case SSTATE_NOT_INTERNED:
1902 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001903
Benjamin Peterson29060642009-01-31 22:14:21 +00001904 case SSTATE_INTERNED_MORTAL:
1905 /* revive dead object temporarily for DelItem */
1906 Py_REFCNT(unicode) = 3;
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001907 if (PyDict_DelItem(interned, unicode) != 0) {
1908 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1909 NULL);
1910 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001912
Benjamin Peterson29060642009-01-31 22:14:21 +00001913 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1915 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001916
Benjamin Peterson29060642009-01-31 22:14:21 +00001917 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001918 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001919 }
1920
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001921 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001922 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001923 }
1924 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001925 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001926 }
1927 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001928 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001929 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001930
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001931 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001932}
1933
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001934#ifdef Py_DEBUG
1935static int
1936unicode_is_singleton(PyObject *unicode)
1937{
1938 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1939 if (unicode == unicode_empty)
1940 return 1;
1941 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1942 {
1943 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1944 if (ch < 256 && unicode_latin1[ch] == unicode)
1945 return 1;
1946 }
1947 return 0;
1948}
1949#endif
1950
Alexander Belopolsky40018472011-02-26 01:02:56 +00001951static int
Victor Stinner488fa492011-12-12 00:01:39 +01001952unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001953{
Victor Stinner488fa492011-12-12 00:01:39 +01001954 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001955 if (Py_REFCNT(unicode) != 1)
1956 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001957 if (_PyUnicode_HASH(unicode) != -1)
1958 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001959 if (PyUnicode_CHECK_INTERNED(unicode))
1960 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001961 if (!PyUnicode_CheckExact(unicode))
1962 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001963#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001964 /* singleton refcount is greater than 1 */
1965 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001966#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 return 1;
1968}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001969
Victor Stinnerfe226c02011-10-03 03:52:20 +02001970static int
1971unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1972{
1973 PyObject *unicode;
1974 Py_ssize_t old_length;
1975
1976 assert(p_unicode != NULL);
1977 unicode = *p_unicode;
1978
1979 assert(unicode != NULL);
1980 assert(PyUnicode_Check(unicode));
1981 assert(0 <= length);
1982
Victor Stinner910337b2011-10-03 03:20:16 +02001983 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001984 old_length = PyUnicode_WSTR_LENGTH(unicode);
1985 else
1986 old_length = PyUnicode_GET_LENGTH(unicode);
1987 if (old_length == length)
1988 return 0;
1989
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001990 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001991 _Py_INCREF_UNICODE_EMPTY();
1992 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001993 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001994 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001995 return 0;
1996 }
1997
Victor Stinner488fa492011-12-12 00:01:39 +01001998 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001999 PyObject *copy = resize_copy(unicode, length);
2000 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002001 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002002 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002003 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002004 }
2005
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002007 PyObject *new_unicode = resize_compact(unicode, length);
2008 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002009 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002010 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002011 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002012 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002013 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002014}
2015
Alexander Belopolsky40018472011-02-26 01:02:56 +00002016int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002017PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002018{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002019 PyObject *unicode;
2020 if (p_unicode == NULL) {
2021 PyErr_BadInternalCall();
2022 return -1;
2023 }
2024 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002025 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 {
2027 PyErr_BadInternalCall();
2028 return -1;
2029 }
2030 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002031}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002032
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002033/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002034
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002035 WARNING: The function doesn't copy the terminating null character and
2036 doesn't check the maximum character (may write a latin1 character in an
2037 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002038static void
2039unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2040 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002041{
2042 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2043 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002044 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002045
2046 switch (kind) {
2047 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002048 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02002049#ifdef Py_DEBUG
2050 if (PyUnicode_IS_ASCII(unicode)) {
2051 Py_UCS4 maxchar = ucs1lib_find_max_char(
2052 (const Py_UCS1*)str,
2053 (const Py_UCS1*)str + len);
2054 assert(maxchar < 128);
2055 }
2056#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002057 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002058 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002059 }
2060 case PyUnicode_2BYTE_KIND: {
2061 Py_UCS2 *start = (Py_UCS2 *)data + index;
2062 Py_UCS2 *ucs2 = start;
2063 assert(index <= PyUnicode_GET_LENGTH(unicode));
2064
Victor Stinner184252a2012-06-16 02:57:41 +02002065 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 *ucs2 = (Py_UCS2)*str;
2067
2068 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002069 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002070 }
2071 default: {
2072 Py_UCS4 *start = (Py_UCS4 *)data + index;
2073 Py_UCS4 *ucs4 = start;
2074 assert(kind == PyUnicode_4BYTE_KIND);
2075 assert(index <= PyUnicode_GET_LENGTH(unicode));
2076
Victor Stinner184252a2012-06-16 02:57:41 +02002077 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002078 *ucs4 = (Py_UCS4)*str;
2079
2080 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002081 }
2082 }
2083}
2084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085static PyObject*
2086get_latin1_char(unsigned char ch)
2087{
Victor Stinnera464fc12011-10-02 20:39:30 +02002088 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002089 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002090 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 if (!unicode)
2092 return NULL;
2093 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002094 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002095 unicode_latin1[ch] = unicode;
2096 }
2097 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002098 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002099}
2100
Victor Stinner985a82a2014-01-03 12:53:47 +01002101static PyObject*
2102unicode_char(Py_UCS4 ch)
2103{
2104 PyObject *unicode;
2105
2106 assert(ch <= MAX_UNICODE);
2107
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002108 if (ch < 256)
2109 return get_latin1_char(ch);
2110
Victor Stinner985a82a2014-01-03 12:53:47 +01002111 unicode = PyUnicode_New(1, ch);
2112 if (unicode == NULL)
2113 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002114
2115 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2116 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002117 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002118 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002119 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2120 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2121 }
2122 assert(_PyUnicode_CheckConsistency(unicode, 1));
2123 return unicode;
2124}
2125
Alexander Belopolsky40018472011-02-26 01:02:56 +00002126PyObject *
2127PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002128{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002129 if (u == NULL)
2130 return (PyObject*)_PyUnicode_New(size);
2131
2132 if (size < 0) {
2133 PyErr_BadInternalCall();
2134 return NULL;
2135 }
2136
2137 return PyUnicode_FromWideChar(u, size);
2138}
2139
2140PyObject *
2141PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002143 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 Py_UCS4 maxchar = 0;
2145 Py_ssize_t num_surrogates;
2146
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002147 if (u == NULL && size != 0) {
2148 PyErr_BadInternalCall();
2149 return NULL;
2150 }
2151
2152 if (size == -1) {
2153 size = wcslen(u);
2154 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002155
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002156 /* If the Unicode data is known at construction time, we can apply
2157 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002158
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002159 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002160 if (size == 0)
2161 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002163 /* Single character Unicode objects in the Latin-1 range are
2164 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002165 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 return get_latin1_char((unsigned char)*u);
2167
2168 /* If not empty and not single character, copy the Unicode data
2169 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002170 if (find_maxchar_surrogates(u, u + size,
2171 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002172 return NULL;
2173
Victor Stinner8faf8212011-12-08 22:14:11 +01002174 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002175 if (!unicode)
2176 return NULL;
2177
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002178 switch (PyUnicode_KIND(unicode)) {
2179 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002180 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002181 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2182 break;
2183 case PyUnicode_2BYTE_KIND:
2184#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002185 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002186#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002187 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2189#endif
2190 break;
2191 case PyUnicode_4BYTE_KIND:
2192#if SIZEOF_WCHAR_T == 2
2193 /* This is the only case which has to process surrogates, thus
2194 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002195 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002196#else
2197 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002198 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002199#endif
2200 break;
2201 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002202 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002204
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002206}
2207
Alexander Belopolsky40018472011-02-26 01:02:56 +00002208PyObject *
2209PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002210{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002211 if (size < 0) {
2212 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002213 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002214 return NULL;
2215 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002216 if (u != NULL)
2217 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2218 else
2219 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002220}
2221
Alexander Belopolsky40018472011-02-26 01:02:56 +00002222PyObject *
2223PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002224{
2225 size_t size = strlen(u);
2226 if (size > PY_SSIZE_T_MAX) {
2227 PyErr_SetString(PyExc_OverflowError, "input too long");
2228 return NULL;
2229 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002230 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231}
2232
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002233PyObject *
2234_PyUnicode_FromId(_Py_Identifier *id)
2235{
2236 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002237 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2238 strlen(id->string),
2239 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002240 if (!id->object)
2241 return NULL;
2242 PyUnicode_InternInPlace(&id->object);
2243 assert(!id->next);
2244 id->next = static_strings;
2245 static_strings = id;
2246 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002247 return id->object;
2248}
2249
2250void
2251_PyUnicode_ClearStaticStrings()
2252{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002253 _Py_Identifier *tmp, *s = static_strings;
2254 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002255 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002256 tmp = s->next;
2257 s->next = NULL;
2258 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002259 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002260 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002261}
2262
Benjamin Peterson0df54292012-03-26 14:50:32 -04002263/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002264
Victor Stinnerd3f08822012-05-29 12:57:52 +02002265PyObject*
2266_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002267{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002268 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002269 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002270 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002271#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002272 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002273#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002274 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002275 }
Victor Stinner785938e2011-12-11 20:09:03 +01002276 unicode = PyUnicode_New(size, 127);
2277 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002278 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002279 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2280 assert(_PyUnicode_CheckConsistency(unicode, 1));
2281 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002282}
2283
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002284static Py_UCS4
2285kind_maxchar_limit(unsigned int kind)
2286{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002287 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002288 case PyUnicode_1BYTE_KIND:
2289 return 0x80;
2290 case PyUnicode_2BYTE_KIND:
2291 return 0x100;
2292 case PyUnicode_4BYTE_KIND:
2293 return 0x10000;
2294 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002295 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002296 }
2297}
2298
Victor Stinner702c7342011-10-05 13:50:52 +02002299static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002300_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002301{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002302 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002303 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002304
Serhiy Storchaka678db842013-01-26 12:16:36 +02002305 if (size == 0)
2306 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002307 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002308 if (size == 1)
2309 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002311 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002312 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002313 if (!res)
2314 return NULL;
2315 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002316 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002317 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002318}
2319
Victor Stinnere57b1c02011-09-28 22:20:48 +02002320static PyObject*
2321_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002322{
2323 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002324 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002325
Serhiy Storchaka678db842013-01-26 12:16:36 +02002326 if (size == 0)
2327 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002328 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002329 if (size == 1)
2330 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002331
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002332 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002333 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002334 if (!res)
2335 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002337 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002338 else {
2339 _PyUnicode_CONVERT_BYTES(
2340 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2341 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002342 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002343 return res;
2344}
2345
Victor Stinnere57b1c02011-09-28 22:20:48 +02002346static PyObject*
2347_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002348{
2349 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002350 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002351
Serhiy Storchaka678db842013-01-26 12:16:36 +02002352 if (size == 0)
2353 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002354 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002355 if (size == 1)
2356 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002358 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002359 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002360 if (!res)
2361 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002362 if (max_char < 256)
2363 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2364 PyUnicode_1BYTE_DATA(res));
2365 else if (max_char < 0x10000)
2366 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2367 PyUnicode_2BYTE_DATA(res));
2368 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002369 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002370 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002371 return res;
2372}
2373
2374PyObject*
2375PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2376{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002377 if (size < 0) {
2378 PyErr_SetString(PyExc_ValueError, "size must be positive");
2379 return NULL;
2380 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002381 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002382 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002383 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002384 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002385 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002386 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002387 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002388 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002389 PyErr_SetString(PyExc_SystemError, "invalid kind");
2390 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002392}
2393
Victor Stinnerece58de2012-04-23 23:36:38 +02002394Py_UCS4
2395_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2396{
2397 enum PyUnicode_Kind kind;
2398 void *startptr, *endptr;
2399
2400 assert(PyUnicode_IS_READY(unicode));
2401 assert(0 <= start);
2402 assert(end <= PyUnicode_GET_LENGTH(unicode));
2403 assert(start <= end);
2404
2405 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2406 return PyUnicode_MAX_CHAR_VALUE(unicode);
2407
2408 if (start == end)
2409 return 127;
2410
Victor Stinner94d558b2012-04-27 22:26:58 +02002411 if (PyUnicode_IS_ASCII(unicode))
2412 return 127;
2413
Victor Stinnerece58de2012-04-23 23:36:38 +02002414 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002415 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002416 endptr = (char *)startptr + end * kind;
2417 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002418 switch(kind) {
2419 case PyUnicode_1BYTE_KIND:
2420 return ucs1lib_find_max_char(startptr, endptr);
2421 case PyUnicode_2BYTE_KIND:
2422 return ucs2lib_find_max_char(startptr, endptr);
2423 case PyUnicode_4BYTE_KIND:
2424 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002425 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002426 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002427 }
2428}
2429
Victor Stinner25a4b292011-10-06 12:31:55 +02002430/* Ensure that a string uses the most efficient storage, if it is not the
2431 case: create a new string with of the right kind. Write NULL into *p_unicode
2432 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002433static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002434unicode_adjust_maxchar(PyObject **p_unicode)
2435{
2436 PyObject *unicode, *copy;
2437 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002438 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002439 unsigned int kind;
2440
2441 assert(p_unicode != NULL);
2442 unicode = *p_unicode;
2443 assert(PyUnicode_IS_READY(unicode));
2444 if (PyUnicode_IS_ASCII(unicode))
2445 return;
2446
2447 len = PyUnicode_GET_LENGTH(unicode);
2448 kind = PyUnicode_KIND(unicode);
2449 if (kind == PyUnicode_1BYTE_KIND) {
2450 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002451 max_char = ucs1lib_find_max_char(u, u + len);
2452 if (max_char >= 128)
2453 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002454 }
2455 else if (kind == PyUnicode_2BYTE_KIND) {
2456 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002457 max_char = ucs2lib_find_max_char(u, u + len);
2458 if (max_char >= 256)
2459 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002460 }
2461 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002462 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002463 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002464 max_char = ucs4lib_find_max_char(u, u + len);
2465 if (max_char >= 0x10000)
2466 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002467 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002468 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002469 if (copy != NULL)
2470 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002471 Py_DECREF(unicode);
2472 *p_unicode = copy;
2473}
2474
Victor Stinner034f6cf2011-09-30 02:26:44 +02002475PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002476_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002477{
Victor Stinner87af4f22011-11-21 23:03:47 +01002478 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002479 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002480
Victor Stinner034f6cf2011-09-30 02:26:44 +02002481 if (!PyUnicode_Check(unicode)) {
2482 PyErr_BadInternalCall();
2483 return NULL;
2484 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002485 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002486 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002487
Victor Stinner87af4f22011-11-21 23:03:47 +01002488 length = PyUnicode_GET_LENGTH(unicode);
2489 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002490 if (!copy)
2491 return NULL;
2492 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2493
Christian Heimesf051e432016-09-13 20:22:02 +02002494 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002495 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002496 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002497 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002498}
2499
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002500
Victor Stinnerbc603d12011-10-02 01:00:40 +02002501/* Widen Unicode objects to larger buffers. Don't write terminating null
2502 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503
2504void*
2505_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2506{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002507 Py_ssize_t len;
2508 void *result;
2509 unsigned int skind;
2510
Benjamin Petersonbac79492012-01-14 13:34:47 -05002511 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002512 return NULL;
2513
2514 len = PyUnicode_GET_LENGTH(s);
2515 skind = PyUnicode_KIND(s);
2516 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002517 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002518 return NULL;
2519 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002520 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002521 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002523 if (!result)
2524 return PyErr_NoMemory();
2525 assert(skind == PyUnicode_1BYTE_KIND);
2526 _PyUnicode_CONVERT_BYTES(
2527 Py_UCS1, Py_UCS2,
2528 PyUnicode_1BYTE_DATA(s),
2529 PyUnicode_1BYTE_DATA(s) + len,
2530 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002531 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002532 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002533 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002534 if (!result)
2535 return PyErr_NoMemory();
2536 if (skind == PyUnicode_2BYTE_KIND) {
2537 _PyUnicode_CONVERT_BYTES(
2538 Py_UCS2, Py_UCS4,
2539 PyUnicode_2BYTE_DATA(s),
2540 PyUnicode_2BYTE_DATA(s) + len,
2541 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002542 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002543 else {
2544 assert(skind == PyUnicode_1BYTE_KIND);
2545 _PyUnicode_CONVERT_BYTES(
2546 Py_UCS1, Py_UCS4,
2547 PyUnicode_1BYTE_DATA(s),
2548 PyUnicode_1BYTE_DATA(s) + len,
2549 result);
2550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002551 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002552 default:
2553 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 }
Victor Stinner01698042011-10-04 00:04:26 +02002555 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002556 return NULL;
2557}
2558
2559static Py_UCS4*
2560as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2561 int copy_null)
2562{
2563 int kind;
2564 void *data;
2565 Py_ssize_t len, targetlen;
2566 if (PyUnicode_READY(string) == -1)
2567 return NULL;
2568 kind = PyUnicode_KIND(string);
2569 data = PyUnicode_DATA(string);
2570 len = PyUnicode_GET_LENGTH(string);
2571 targetlen = len;
2572 if (copy_null)
2573 targetlen++;
2574 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002575 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002576 if (!target) {
2577 PyErr_NoMemory();
2578 return NULL;
2579 }
2580 }
2581 else {
2582 if (targetsize < targetlen) {
2583 PyErr_Format(PyExc_SystemError,
2584 "string is longer than the buffer");
2585 if (copy_null && 0 < targetsize)
2586 target[0] = 0;
2587 return NULL;
2588 }
2589 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002590 if (kind == PyUnicode_1BYTE_KIND) {
2591 Py_UCS1 *start = (Py_UCS1 *) data;
2592 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002593 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002594 else if (kind == PyUnicode_2BYTE_KIND) {
2595 Py_UCS2 *start = (Py_UCS2 *) data;
2596 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2597 }
2598 else {
2599 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002600 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002601 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002602 if (copy_null)
2603 target[len] = 0;
2604 return target;
2605}
2606
2607Py_UCS4*
2608PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2609 int copy_null)
2610{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002611 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002612 PyErr_BadInternalCall();
2613 return NULL;
2614 }
2615 return as_ucs4(string, target, targetsize, copy_null);
2616}
2617
2618Py_UCS4*
2619PyUnicode_AsUCS4Copy(PyObject *string)
2620{
2621 return as_ucs4(string, NULL, 0, 1);
2622}
2623
/* Maximum number of characters required for the output of %lld or %p.
   At most ceil(log10(256) * SIZEOF_LONG_LONG) digits are needed, plus 1
   for the sign; 53/22 is used as an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG) * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002628
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002629static int
2630unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2631 Py_ssize_t width, Py_ssize_t precision)
2632{
2633 Py_ssize_t length, fill, arglen;
2634 Py_UCS4 maxchar;
2635
2636 if (PyUnicode_READY(str) == -1)
2637 return -1;
2638
2639 length = PyUnicode_GET_LENGTH(str);
2640 if ((precision == -1 || precision >= length)
2641 && width <= length)
2642 return _PyUnicodeWriter_WriteStr(writer, str);
2643
2644 if (precision != -1)
2645 length = Py_MIN(precision, length);
2646
2647 arglen = Py_MAX(length, width);
2648 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2649 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2650 else
2651 maxchar = writer->maxchar;
2652
2653 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2654 return -1;
2655
2656 if (width > length) {
2657 fill = width - length;
2658 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2659 return -1;
2660 writer->pos += fill;
2661 }
2662
2663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2664 str, 0, length);
2665 writer->pos += length;
2666 return 0;
2667}
2668
2669static int
Victor Stinner998b8062018-09-12 00:23:25 +02002670unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002671 Py_ssize_t width, Py_ssize_t precision)
2672{
2673 /* UTF-8 */
2674 Py_ssize_t length;
2675 PyObject *unicode;
2676 int res;
2677
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002678 if (precision == -1) {
2679 length = strlen(str);
2680 }
2681 else {
2682 length = 0;
2683 while (length < precision && str[length]) {
2684 length++;
2685 }
2686 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002687 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2688 if (unicode == NULL)
2689 return -1;
2690
2691 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2692 Py_DECREF(unicode);
2693 return res;
2694}
2695
Victor Stinner96865452011-03-01 23:44:09 +00002696static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002697unicode_fromformat_arg(_PyUnicodeWriter *writer,
2698 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002699{
Victor Stinnere215d962012-10-06 23:03:36 +02002700 const char *p;
2701 Py_ssize_t len;
2702 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002703 Py_ssize_t width;
2704 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002705 int longflag;
2706 int longlongflag;
2707 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002708 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002709
2710 p = f;
2711 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002712 zeropad = 0;
2713 if (*f == '0') {
2714 zeropad = 1;
2715 f++;
2716 }
Victor Stinner96865452011-03-01 23:44:09 +00002717
2718 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002719 width = -1;
2720 if (Py_ISDIGIT((unsigned)*f)) {
2721 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002722 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002723 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002724 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002725 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002726 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002727 return NULL;
2728 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002729 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002730 f++;
2731 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002732 }
2733 precision = -1;
2734 if (*f == '.') {
2735 f++;
2736 if (Py_ISDIGIT((unsigned)*f)) {
2737 precision = (*f - '0');
2738 f++;
2739 while (Py_ISDIGIT((unsigned)*f)) {
2740 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2741 PyErr_SetString(PyExc_ValueError,
2742 "precision too big");
2743 return NULL;
2744 }
2745 precision = (precision * 10) + (*f - '0');
2746 f++;
2747 }
2748 }
Victor Stinner96865452011-03-01 23:44:09 +00002749 if (*f == '%') {
2750 /* "%.3%s" => f points to "3" */
2751 f--;
2752 }
2753 }
2754 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002755 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002756 f--;
2757 }
Victor Stinner96865452011-03-01 23:44:09 +00002758
2759 /* Handle %ld, %lu, %lld and %llu. */
2760 longflag = 0;
2761 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002762 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002763 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002764 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002765 longflag = 1;
2766 ++f;
2767 }
Victor Stinner96865452011-03-01 23:44:09 +00002768 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002769 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002770 longlongflag = 1;
2771 f += 2;
2772 }
Victor Stinner96865452011-03-01 23:44:09 +00002773 }
2774 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002775 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002776 size_tflag = 1;
2777 ++f;
2778 }
Victor Stinnere215d962012-10-06 23:03:36 +02002779
2780 if (f[1] == '\0')
2781 writer->overallocate = 0;
2782
2783 switch (*f) {
2784 case 'c':
2785 {
2786 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002787 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002788 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002789 "character argument not in range(0x110000)");
2790 return NULL;
2791 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002792 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002793 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 break;
2795 }
2796
2797 case 'i':
2798 case 'd':
2799 case 'u':
2800 case 'x':
2801 {
2802 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002803 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002804 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002805
2806 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002807 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002808 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002809 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002810 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002811 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002812 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002813 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002814 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002815 va_arg(*vargs, size_t));
2816 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002817 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002818 va_arg(*vargs, unsigned int));
2819 }
2820 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002821 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002822 }
2823 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002824 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002825 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002826 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002827 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002828 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002829 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002830 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002831 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002832 va_arg(*vargs, Py_ssize_t));
2833 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002834 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002835 va_arg(*vargs, int));
2836 }
2837 assert(len >= 0);
2838
Victor Stinnere215d962012-10-06 23:03:36 +02002839 if (precision < len)
2840 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841
2842 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002843 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2844 return NULL;
2845
Victor Stinnere215d962012-10-06 23:03:36 +02002846 if (width > precision) {
2847 Py_UCS4 fillchar;
2848 fill = width - precision;
2849 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002850 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2851 return NULL;
2852 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002853 }
Victor Stinner15a11362012-10-06 23:48:20 +02002854 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002855 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002856 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2857 return NULL;
2858 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002859 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002860
Victor Stinner4a587072013-11-19 12:54:53 +01002861 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2862 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002863 break;
2864 }
2865
2866 case 'p':
2867 {
2868 char number[MAX_LONG_LONG_CHARS];
2869
2870 len = sprintf(number, "%p", va_arg(*vargs, void*));
2871 assert(len >= 0);
2872
2873 /* %p is ill-defined: ensure leading 0x. */
2874 if (number[1] == 'X')
2875 number[1] = 'x';
2876 else if (number[1] != 'x') {
2877 memmove(number + 2, number,
2878 strlen(number) + 1);
2879 number[0] = '0';
2880 number[1] = 'x';
2881 len += 2;
2882 }
2883
Victor Stinner4a587072013-11-19 12:54:53 +01002884 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002885 return NULL;
2886 break;
2887 }
2888
2889 case 's':
2890 {
2891 /* UTF-8 */
2892 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002893 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002894 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002895 break;
2896 }
2897
2898 case 'U':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 assert(obj && _PyUnicode_CHECK(obj));
2902
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002903 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002904 return NULL;
2905 break;
2906 }
2907
2908 case 'V':
2909 {
2910 PyObject *obj = va_arg(*vargs, PyObject *);
2911 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002912 if (obj) {
2913 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002914 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002915 return NULL;
2916 }
2917 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002918 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002919 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002920 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002921 }
2922 break;
2923 }
2924
2925 case 'S':
2926 {
2927 PyObject *obj = va_arg(*vargs, PyObject *);
2928 PyObject *str;
2929 assert(obj);
2930 str = PyObject_Str(obj);
2931 if (!str)
2932 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002933 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002934 Py_DECREF(str);
2935 return NULL;
2936 }
2937 Py_DECREF(str);
2938 break;
2939 }
2940
2941 case 'R':
2942 {
2943 PyObject *obj = va_arg(*vargs, PyObject *);
2944 PyObject *repr;
2945 assert(obj);
2946 repr = PyObject_Repr(obj);
2947 if (!repr)
2948 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002949 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002950 Py_DECREF(repr);
2951 return NULL;
2952 }
2953 Py_DECREF(repr);
2954 break;
2955 }
2956
2957 case 'A':
2958 {
2959 PyObject *obj = va_arg(*vargs, PyObject *);
2960 PyObject *ascii;
2961 assert(obj);
2962 ascii = PyObject_ASCII(obj);
2963 if (!ascii)
2964 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002965 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002966 Py_DECREF(ascii);
2967 return NULL;
2968 }
2969 Py_DECREF(ascii);
2970 break;
2971 }
2972
2973 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002974 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002975 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002976 break;
2977
2978 default:
2979 /* if we stumble upon an unknown formatting code, copy the rest
2980 of the format string to the output string. (we cannot just
2981 skip the code, since there's no way to know what's in the
2982 argument list) */
2983 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002984 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002985 return NULL;
2986 f = p+len;
2987 return f;
2988 }
2989
2990 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002991 return f;
2992}
2993
Walter Dörwaldd2034312007-05-18 16:29:38 +00002994PyObject *
2995PyUnicode_FromFormatV(const char *format, va_list vargs)
2996{
Victor Stinnere215d962012-10-06 23:03:36 +02002997 va_list vargs2;
2998 const char *f;
2999 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000
Victor Stinner8f674cc2013-04-17 23:02:17 +02003001 _PyUnicodeWriter_Init(&writer);
3002 writer.min_length = strlen(format) + 100;
3003 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003004
Benjamin Peterson0c212142016-09-20 20:39:33 -07003005 // Copy varags to be able to pass a reference to a subfunction.
3006 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003007
3008 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003009 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003010 f = unicode_fromformat_arg(&writer, f, &vargs2);
3011 if (f == NULL)
3012 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003013 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003014 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003015 const char *p;
3016 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003017
Victor Stinnere215d962012-10-06 23:03:36 +02003018 p = f;
3019 do
3020 {
3021 if ((unsigned char)*p > 127) {
3022 PyErr_Format(PyExc_ValueError,
3023 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3024 "string, got a non-ASCII byte: 0x%02x",
3025 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003026 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003027 }
3028 p++;
3029 }
3030 while (*p != '\0' && *p != '%');
3031 len = p - f;
3032
3033 if (*p == '\0')
3034 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003035
3036 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003037 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003038
3039 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003040 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003041 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003042 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003043 return _PyUnicodeWriter_Finish(&writer);
3044
3045 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003046 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003047 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003048 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003049}
3050
Walter Dörwaldd2034312007-05-18 16:29:38 +00003051PyObject *
3052PyUnicode_FromFormat(const char *format, ...)
3053{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003054 PyObject* ret;
3055 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003056
3057#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003059#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003061#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003062 ret = PyUnicode_FromFormatV(format, vargs);
3063 va_end(vargs);
3064 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003065}
3066
Serhiy Storchakac46db922018-10-23 22:58:24 +03003067static Py_ssize_t
3068unicode_get_widechar_size(PyObject *unicode)
3069{
3070 Py_ssize_t res;
3071
3072 assert(unicode != NULL);
3073 assert(_PyUnicode_CHECK(unicode));
3074
3075 if (_PyUnicode_WSTR(unicode) != NULL) {
3076 return PyUnicode_WSTR_LENGTH(unicode);
3077 }
3078 assert(PyUnicode_IS_READY(unicode));
3079
3080 res = _PyUnicode_LENGTH(unicode);
3081#if SIZEOF_WCHAR_T == 2
3082 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3083 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3084 const Py_UCS4 *end = s + res;
3085 for (; s < end; ++s) {
3086 if (*s > 0xFFFF) {
3087 ++res;
3088 }
3089 }
3090 }
3091#endif
3092 return res;
3093}
3094
3095static void
3096unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3097{
3098 const wchar_t *wstr;
3099
3100 assert(unicode != NULL);
3101 assert(_PyUnicode_CHECK(unicode));
3102
3103 wstr = _PyUnicode_WSTR(unicode);
3104 if (wstr != NULL) {
3105 memcpy(w, wstr, size * sizeof(wchar_t));
3106 return;
3107 }
3108 assert(PyUnicode_IS_READY(unicode));
3109
3110 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3111 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3112 for (; size--; ++s, ++w) {
3113 *w = *s;
3114 }
3115 }
3116 else {
3117#if SIZEOF_WCHAR_T == 4
3118 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3119 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3120 for (; size--; ++s, ++w) {
3121 *w = *s;
3122 }
3123#else
3124 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3125 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3126 for (; size--; ++s, ++w) {
3127 Py_UCS4 ch = *s;
3128 if (ch > 0xFFFF) {
3129 assert(ch <= MAX_UNICODE);
3130 /* encode surrogate pair in this case */
3131 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3132 if (!size--)
3133 break;
3134 *w = Py_UNICODE_LOW_SURROGATE(ch);
3135 }
3136 else {
3137 *w = ch;
3138 }
3139 }
3140#endif
3141 }
3142}
3143
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003144#ifdef HAVE_WCHAR_H
3145
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003146/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003147
Victor Stinnerd88d9832011-09-06 02:00:05 +02003148 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003149 character) required to convert the unicode object. Ignore size argument.
3150
Victor Stinnerd88d9832011-09-06 02:00:05 +02003151 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003152 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003153 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003154Py_ssize_t
3155PyUnicode_AsWideChar(PyObject *unicode,
3156 wchar_t *w,
3157 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003158{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003159 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003160
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003161 if (unicode == NULL) {
3162 PyErr_BadInternalCall();
3163 return -1;
3164 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003165 if (!PyUnicode_Check(unicode)) {
3166 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003168 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003169
3170 res = unicode_get_widechar_size(unicode);
3171 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003172 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003173 }
3174
3175 if (size > res) {
3176 size = res + 1;
3177 }
3178 else {
3179 res = size;
3180 }
3181 unicode_copy_as_widechar(unicode, w, size);
3182 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003183}
3184
Victor Stinner137c34c2010-09-29 10:25:54 +00003185wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003186PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003187 Py_ssize_t *size)
3188{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003189 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003190 Py_ssize_t buflen;
3191
3192 if (unicode == NULL) {
3193 PyErr_BadInternalCall();
3194 return NULL;
3195 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003196 if (!PyUnicode_Check(unicode)) {
3197 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003198 return NULL;
3199 }
3200
Serhiy Storchakac46db922018-10-23 22:58:24 +03003201 buflen = unicode_get_widechar_size(unicode);
3202 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003203 if (buffer == NULL) {
3204 PyErr_NoMemory();
3205 return NULL;
3206 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003207 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3208 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003209 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003210 }
3211 else if (wcslen(buffer) != (size_t)buflen) {
3212 PyMem_FREE(buffer);
3213 PyErr_SetString(PyExc_ValueError,
3214 "embedded null character");
3215 return NULL;
3216 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003217 return buffer;
3218}
3219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003220#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003221
Alexander Belopolsky40018472011-02-26 01:02:56 +00003222PyObject *
3223PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003224{
Victor Stinner8faf8212011-12-08 22:14:11 +01003225 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003226 PyErr_SetString(PyExc_ValueError,
3227 "chr() arg not in range(0x110000)");
3228 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003229 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003230
Victor Stinner985a82a2014-01-03 12:53:47 +01003231 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003232}
3233
Alexander Belopolsky40018472011-02-26 01:02:56 +00003234PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003235PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003236{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003237 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003238 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003239 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003240 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003241 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003242 Py_INCREF(obj);
3243 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003244 }
3245 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003246 /* For a Unicode subtype that's not a Unicode object,
3247 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003248 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003249 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003250 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003251 "Can't convert '%.100s' object to str implicitly",
3252 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003253 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003254}
3255
Alexander Belopolsky40018472011-02-26 01:02:56 +00003256PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003257PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003258 const char *encoding,
3259 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003260{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003261 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003262 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003263
Guido van Rossumd57fd912000-03-10 22:53:23 +00003264 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003265 PyErr_BadInternalCall();
3266 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003267 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003268
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003269 /* Decoding bytes objects is the most common case and should be fast */
3270 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003271 if (PyBytes_GET_SIZE(obj) == 0) {
3272 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3273 return NULL;
3274 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003275 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003276 }
3277 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003278 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3279 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003280 }
3281
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003282 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003283 PyErr_SetString(PyExc_TypeError,
3284 "decoding str is not supported");
3285 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003286 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003287
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003288 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3289 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3290 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003291 "decoding to str: need a bytes-like object, %.80s found",
3292 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003293 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003294 }
Tim Petersced69f82003-09-16 20:30:58 +00003295
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003296 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003297 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003298 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3299 return NULL;
3300 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003301 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003302 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003303
Serhiy Storchaka05997252013-01-26 12:14:02 +02003304 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003305 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003306 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003307}
3308
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are copied to `lower` in lowercase.  Every
   run of other characters that sits between two copied characters is
   replaced by a single '_'; such runs at the start or end of the name are
   dropped.  The result is null-terminated.

   `lower_len` is the size of the `lower` buffer in bytes, including room for
   the terminator.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters, or lower_len is 0).  On error the contents of
   `lower` are unspecified and not null-terminated. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    size_t used = 0;
    size_t capacity;
    int pending_sep = 0;

    assert(encoding != NULL);

    /* Without this guard lower_len - 1 would wrap around, the capacity
       check below would never trigger, and the terminator would be written
       into a zero-sized buffer. */
    if (lower_len == 0) {
        return 0;
    }
    capacity = lower_len - 1;   /* keep one byte for the terminator */

    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Remember the separator; it is only emitted if another
               copied character follows. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && used != 0) {
            if (used == capacity) {
                return 0;
            }
            lower[used++] = '_';
        }
        pending_sep = 0;

        if (used == capacity) {
            return 0;
        }
        lower[used++] = Py_TOLOWER(c);
    }

    lower[used] = '\0';
    return 1;
}
3357
Alexander Belopolsky40018472011-02-26 01:02:56 +00003358PyObject *
3359PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003360 Py_ssize_t size,
3361 const char *encoding,
3362 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003363{
3364 PyObject *buffer = NULL, *unicode;
3365 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003366 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3367
Victor Stinner22eb6892019-06-26 00:51:05 +02003368 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3369 return NULL;
3370 }
3371
Victor Stinnered076ed2019-06-26 01:49:32 +02003372 if (size == 0) {
3373 _Py_RETURN_UNICODE_EMPTY();
3374 }
3375
Victor Stinner942889a2016-09-05 15:40:10 -07003376 if (encoding == NULL) {
3377 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3378 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003379
Fred Drakee4315f52000-05-09 19:53:39 +00003380 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003381 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3382 char *lower = buflower;
3383
3384 /* Fast paths */
3385 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3386 lower += 3;
3387 if (*lower == '_') {
3388 /* Match "utf8" and "utf_8" */
3389 lower++;
3390 }
3391
3392 if (lower[0] == '8' && lower[1] == 0) {
3393 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3394 }
3395 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3396 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3397 }
3398 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3399 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3400 }
3401 }
3402 else {
3403 if (strcmp(lower, "ascii") == 0
3404 || strcmp(lower, "us_ascii") == 0) {
3405 return PyUnicode_DecodeASCII(s, size, errors);
3406 }
Steve Dowercc16be82016-09-08 10:35:16 -07003407 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003408 else if (strcmp(lower, "mbcs") == 0) {
3409 return PyUnicode_DecodeMBCS(s, size, errors);
3410 }
3411 #endif
3412 else if (strcmp(lower, "latin1") == 0
3413 || strcmp(lower, "latin_1") == 0
3414 || strcmp(lower, "iso_8859_1") == 0
3415 || strcmp(lower, "iso8859_1") == 0) {
3416 return PyUnicode_DecodeLatin1(s, size, errors);
3417 }
3418 }
Victor Stinner37296e82010-06-10 13:36:23 +00003419 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003420
3421 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003422 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003423 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003424 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003425 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003426 if (buffer == NULL)
3427 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003428 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003429 if (unicode == NULL)
3430 goto onError;
3431 if (!PyUnicode_Check(unicode)) {
3432 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003433 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003434 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003435 encoding,
3436 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003437 Py_DECREF(unicode);
3438 goto onError;
3439 }
3440 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003441 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003442
Benjamin Peterson29060642009-01-31 22:14:21 +00003443 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003444 Py_XDECREF(buffer);
3445 return NULL;
3446}
3447
Alexander Belopolsky40018472011-02-26 01:02:56 +00003448PyObject *
3449PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003450 const char *encoding,
3451 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003452{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003453 if (!PyUnicode_Check(unicode)) {
3454 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003455 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456 }
3457
Serhiy Storchaka00939072016-10-27 21:05:49 +03003458 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3459 "PyUnicode_AsDecodedObject() is deprecated; "
3460 "use PyCodec_Decode() to decode from str", 1) < 0)
3461 return NULL;
3462
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003463 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003464 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003465
3466 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003467 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003468}
3469
Alexander Belopolsky40018472011-02-26 01:02:56 +00003470PyObject *
3471PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003472 const char *encoding,
3473 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003474{
3475 PyObject *v;
3476
3477 if (!PyUnicode_Check(unicode)) {
3478 PyErr_BadArgument();
3479 goto onError;
3480 }
3481
Serhiy Storchaka00939072016-10-27 21:05:49 +03003482 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3483 "PyUnicode_AsDecodedUnicode() is deprecated; "
3484 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3485 return NULL;
3486
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003488 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003489
3490 /* Decode via the codec registry */
3491 v = PyCodec_Decode(unicode, encoding, errors);
3492 if (v == NULL)
3493 goto onError;
3494 if (!PyUnicode_Check(v)) {
3495 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003496 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003497 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003498 encoding,
3499 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003500 Py_DECREF(v);
3501 goto onError;
3502 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003503 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504
Benjamin Peterson29060642009-01-31 22:14:21 +00003505 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003506 return NULL;
3507}
3508
Alexander Belopolsky40018472011-02-26 01:02:56 +00003509PyObject *
3510PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003511 Py_ssize_t size,
3512 const char *encoding,
3513 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003514{
3515 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003516
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003517 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003519 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003520 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3521 Py_DECREF(unicode);
3522 return v;
3523}
3524
Alexander Belopolsky40018472011-02-26 01:02:56 +00003525PyObject *
3526PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003527 const char *encoding,
3528 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003529{
3530 PyObject *v;
3531
3532 if (!PyUnicode_Check(unicode)) {
3533 PyErr_BadArgument();
3534 goto onError;
3535 }
3536
Serhiy Storchaka00939072016-10-27 21:05:49 +03003537 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3538 "PyUnicode_AsEncodedObject() is deprecated; "
3539 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3540 "or PyCodec_Encode() for generic encoding", 1) < 0)
3541 return NULL;
3542
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003543 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003544 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003545
3546 /* Encode via the codec registry */
3547 v = PyCodec_Encode(unicode, encoding, errors);
3548 if (v == NULL)
3549 goto onError;
3550 return v;
3551
Benjamin Peterson29060642009-01-31 22:14:21 +00003552 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003553 return NULL;
3554}
3555
Victor Stinner1b579672011-12-17 05:47:23 +01003556
Victor Stinner2cba6b82018-01-10 22:46:15 +01003557static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003558unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003559 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003560{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003561 Py_ssize_t wlen;
3562 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3563 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003564 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003565 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003566
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003567 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003568 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003569 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003570 return NULL;
3571 }
3572
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003573 char *str;
3574 size_t error_pos;
3575 const char *reason;
3576 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003577 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003578 PyMem_Free(wstr);
3579
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003580 if (res != 0) {
3581 if (res == -2) {
3582 PyObject *exc;
3583 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3584 "locale", unicode,
3585 (Py_ssize_t)error_pos,
3586 (Py_ssize_t)(error_pos+1),
3587 reason);
3588 if (exc != NULL) {
3589 PyCodec_StrictErrors(exc);
3590 Py_DECREF(exc);
3591 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003592 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003593 else if (res == -3) {
3594 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3595 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003596 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003597 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003598 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003599 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003600 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003601
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003602 PyObject *bytes = PyBytes_FromString(str);
3603 PyMem_RawFree(str);
3604 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003605}
3606
Victor Stinnerad158722010-10-27 00:25:46 +00003607PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003608PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3609{
Victor Stinner709d23d2019-05-02 14:56:30 -04003610 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3611 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003612}
3613
3614PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003615PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003616{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003617 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003618#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003619 if (interp->fs_codec.encoding) {
3620 return unicode_encode_utf8(unicode,
3621 interp->fs_codec.error_handler,
3622 interp->fs_codec.errors);
3623 }
3624 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003625 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003626 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003627 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003628 assert(errors != _Py_ERROR_UNKNOWN);
3629 return unicode_encode_utf8(unicode, errors, NULL);
3630 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003631#else
Victor Stinner793b5312011-04-27 00:24:21 +02003632 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3633 cannot use it to encode and decode filenames before it is loaded. Load
3634 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003635 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003636 initialized and the Python codec is loaded.
3637 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003638 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003639 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003640 interp->fs_codec.encoding,
3641 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003642 }
3643 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003644 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003645 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003646 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003647 assert(errors != _Py_ERROR_UNKNOWN);
3648 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003649 }
Victor Stinnerad158722010-10-27 00:25:46 +00003650#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003651}
3652
Alexander Belopolsky40018472011-02-26 01:02:56 +00003653PyObject *
3654PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003655 const char *encoding,
3656 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003657{
3658 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003659 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003660
Guido van Rossumd57fd912000-03-10 22:53:23 +00003661 if (!PyUnicode_Check(unicode)) {
3662 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003663 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003664 }
Fred Drakee4315f52000-05-09 19:53:39 +00003665
Victor Stinner22eb6892019-06-26 00:51:05 +02003666 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3667 return NULL;
3668 }
3669
Victor Stinner942889a2016-09-05 15:40:10 -07003670 if (encoding == NULL) {
3671 return _PyUnicode_AsUTF8String(unicode, errors);
3672 }
3673
Fred Drakee4315f52000-05-09 19:53:39 +00003674 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003675 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3676 char *lower = buflower;
3677
3678 /* Fast paths */
3679 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3680 lower += 3;
3681 if (*lower == '_') {
3682 /* Match "utf8" and "utf_8" */
3683 lower++;
3684 }
3685
3686 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003687 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003688 }
3689 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3690 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3691 }
3692 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3693 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3694 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003695 }
Victor Stinner942889a2016-09-05 15:40:10 -07003696 else {
3697 if (strcmp(lower, "ascii") == 0
3698 || strcmp(lower, "us_ascii") == 0) {
3699 return _PyUnicode_AsASCIIString(unicode, errors);
3700 }
Steve Dowercc16be82016-09-08 10:35:16 -07003701#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003702 else if (strcmp(lower, "mbcs") == 0) {
3703 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3704 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003705#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003706 else if (strcmp(lower, "latin1") == 0 ||
3707 strcmp(lower, "latin_1") == 0 ||
3708 strcmp(lower, "iso_8859_1") == 0 ||
3709 strcmp(lower, "iso8859_1") == 0) {
3710 return _PyUnicode_AsLatin1String(unicode, errors);
3711 }
3712 }
Victor Stinner37296e82010-06-10 13:36:23 +00003713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714
3715 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003716 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003718 return NULL;
3719
3720 /* The normal path */
3721 if (PyBytes_Check(v))
3722 return v;
3723
3724 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003725 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003726 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003727 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003728
3729 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003730 "encoder %s returned bytearray instead of bytes; "
3731 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003732 encoding);
3733 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003734 Py_DECREF(v);
3735 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003736 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003737
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003738 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3739 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003740 Py_DECREF(v);
3741 return b;
3742 }
3743
3744 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003745 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003746 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003747 encoding,
3748 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003749 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003750 return NULL;
3751}
3752
Alexander Belopolsky40018472011-02-26 01:02:56 +00003753PyObject *
3754PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003755 const char *encoding,
3756 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003757{
3758 PyObject *v;
3759
3760 if (!PyUnicode_Check(unicode)) {
3761 PyErr_BadArgument();
3762 goto onError;
3763 }
3764
Serhiy Storchaka00939072016-10-27 21:05:49 +03003765 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3766 "PyUnicode_AsEncodedUnicode() is deprecated; "
3767 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3768 return NULL;
3769
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003770 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003771 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003772
3773 /* Encode via the codec registry */
3774 v = PyCodec_Encode(unicode, encoding, errors);
3775 if (v == NULL)
3776 goto onError;
3777 if (!PyUnicode_Check(v)) {
3778 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003779 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003780 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003781 encoding,
3782 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003783 Py_DECREF(v);
3784 goto onError;
3785 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003786 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003787
Benjamin Peterson29060642009-01-31 22:14:21 +00003788 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003789 return NULL;
3790}
3791
Victor Stinner2cba6b82018-01-10 22:46:15 +01003792static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003793unicode_decode_locale(const char *str, Py_ssize_t len,
3794 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003795{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003796 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3797 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003798 return NULL;
3799 }
3800
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003801 wchar_t *wstr;
3802 size_t wlen;
3803 const char *reason;
3804 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003805 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003806 if (res != 0) {
3807 if (res == -2) {
3808 PyObject *exc;
3809 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3810 "locale", str, len,
3811 (Py_ssize_t)wlen,
3812 (Py_ssize_t)(wlen + 1),
3813 reason);
3814 if (exc != NULL) {
3815 PyCodec_StrictErrors(exc);
3816 Py_DECREF(exc);
3817 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003818 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003819 else if (res == -3) {
3820 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3821 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003822 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003823 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003824 }
Victor Stinner2f197072011-12-17 07:08:30 +01003825 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003826 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003827
3828 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3829 PyMem_RawFree(wstr);
3830 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003831}
3832
3833PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003834PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3835 const char *errors)
3836{
Victor Stinner709d23d2019-05-02 14:56:30 -04003837 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3838 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003839}
3840
3841PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003842PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003843{
3844 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003845 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3846 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003847}
3848
3849
3850PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003851PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003852 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003853 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3854}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003855
Christian Heimes5894ba72007-11-04 11:43:14 +00003856PyObject*
3857PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3858{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003859 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003860#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003861 if (interp->fs_codec.encoding) {
3862 return unicode_decode_utf8(s, size,
3863 interp->fs_codec.error_handler,
3864 interp->fs_codec.errors,
3865 NULL);
3866 }
3867 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003868 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003869 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003870 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003871 assert(errors != _Py_ERROR_UNKNOWN);
3872 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3873 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003874#else
Victor Stinner793b5312011-04-27 00:24:21 +02003875 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3876 cannot use it to encode and decode filenames before it is loaded. Load
3877 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003878 implementation of the locale codec until the codec registry is
Victor Stinner22eb6892019-06-26 00:51:05 +02003879 initialized and the Python codec is loaded.
3880 See _PyUnicode_InitEncodings(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003881 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003882 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003883 interp->fs_codec.encoding,
3884 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003885 }
3886 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003887 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003888 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003889 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003890 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003891 }
Victor Stinnerad158722010-10-27 00:25:46 +00003892#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003893}
3894
Martin v. Löwis011e8422009-05-05 04:43:17 +00003895
3896int
3897PyUnicode_FSConverter(PyObject* arg, void* addr)
3898{
Brett Cannonec6ce872016-09-06 15:50:29 -07003899 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003900 PyObject *output = NULL;
3901 Py_ssize_t size;
3902 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003903 if (arg == NULL) {
3904 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003905 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003906 return 1;
3907 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003908 path = PyOS_FSPath(arg);
3909 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003910 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003911 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003912 if (PyBytes_Check(path)) {
3913 output = path;
3914 }
3915 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3916 output = PyUnicode_EncodeFSDefault(path);
3917 Py_DECREF(path);
3918 if (!output) {
3919 return 0;
3920 }
3921 assert(PyBytes_Check(output));
3922 }
3923
Victor Stinner0ea2a462010-04-30 00:22:08 +00003924 size = PyBytes_GET_SIZE(output);
3925 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003926 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003927 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003928 Py_DECREF(output);
3929 return 0;
3930 }
3931 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003932 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003933}
3934
3935
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003936int
3937PyUnicode_FSDecoder(PyObject* arg, void* addr)
3938{
Brett Cannona5711202016-09-06 19:36:01 -07003939 int is_buffer = 0;
3940 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003941 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003942 if (arg == NULL) {
3943 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003944 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003945 return 1;
3946 }
Brett Cannona5711202016-09-06 19:36:01 -07003947
3948 is_buffer = PyObject_CheckBuffer(arg);
3949 if (!is_buffer) {
3950 path = PyOS_FSPath(arg);
3951 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003952 return 0;
3953 }
Brett Cannona5711202016-09-06 19:36:01 -07003954 }
3955 else {
3956 path = arg;
3957 Py_INCREF(arg);
3958 }
3959
3960 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003961 output = path;
3962 }
3963 else if (PyBytes_Check(path) || is_buffer) {
3964 PyObject *path_bytes = NULL;
3965
3966 if (!PyBytes_Check(path) &&
3967 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003968 "path should be string, bytes, or os.PathLike, not %.200s",
3969 Py_TYPE(arg)->tp_name)) {
3970 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003971 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003972 }
3973 path_bytes = PyBytes_FromObject(path);
3974 Py_DECREF(path);
3975 if (!path_bytes) {
3976 return 0;
3977 }
3978 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3979 PyBytes_GET_SIZE(path_bytes));
3980 Py_DECREF(path_bytes);
3981 if (!output) {
3982 return 0;
3983 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003984 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003985 else {
3986 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003987 "path should be string, bytes, or os.PathLike, not %.200s",
3988 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003989 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003990 return 0;
3991 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003992 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003993 Py_DECREF(output);
3994 return 0;
3995 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003997 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003998 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003999 Py_DECREF(output);
4000 return 0;
4001 }
4002 *(PyObject**)addr = output;
4003 return Py_CLEANUP_SUPPORTED;
4004}
4005
4006
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004007const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004009{
Christian Heimesf3863112007-11-22 07:46:41 +00004010 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004011
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00004012 if (!PyUnicode_Check(unicode)) {
4013 PyErr_BadArgument();
4014 return NULL;
4015 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004016 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004017 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004019 if (PyUnicode_UTF8(unicode) == NULL) {
4020 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03004021 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004022 if (bytes == NULL)
4023 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004024 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
4025 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01004026 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004027 Py_DECREF(bytes);
4028 return NULL;
4029 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004030 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02004031 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004032 PyBytes_AS_STRING(bytes),
4033 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004034 Py_DECREF(bytes);
4035 }
4036
4037 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004038 *psize = PyUnicode_UTF8_LENGTH(unicode);
4039 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004040}
4041
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004042const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004043PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004044{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4046}
4047
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048Py_UNICODE *
4049PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4050{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 if (!PyUnicode_Check(unicode)) {
4052 PyErr_BadArgument();
4053 return NULL;
4054 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004055 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4056 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004057 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004058 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004059 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004060
Serhiy Storchakac46db922018-10-23 22:58:24 +03004061 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4062 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4063 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004064 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004065 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004066 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4067 if (w == NULL) {
4068 PyErr_NoMemory();
4069 return NULL;
4070 }
4071 unicode_copy_as_widechar(unicode, w, wlen + 1);
4072 _PyUnicode_WSTR(unicode) = w;
4073 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4074 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075 }
4076 }
4077 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004078 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004079 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004080}
4081
Alexander Belopolsky40018472011-02-26 01:02:56 +00004082Py_UNICODE *
4083PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004085 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004086}
4087
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004088const Py_UNICODE *
4089_PyUnicode_AsUnicode(PyObject *unicode)
4090{
4091 Py_ssize_t size;
4092 const Py_UNICODE *wstr;
4093
4094 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4095 if (wstr && wcslen(wstr) != (size_t)size) {
4096 PyErr_SetString(PyExc_ValueError, "embedded null character");
4097 return NULL;
4098 }
4099 return wstr;
4100}
4101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102
Alexander Belopolsky40018472011-02-26 01:02:56 +00004103Py_ssize_t
4104PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004105{
4106 if (!PyUnicode_Check(unicode)) {
4107 PyErr_BadArgument();
4108 goto onError;
4109 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004110 if (_PyUnicode_WSTR(unicode) == NULL) {
4111 if (PyUnicode_AsUnicode(unicode) == NULL)
4112 goto onError;
4113 }
4114 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004115
Benjamin Peterson29060642009-01-31 22:14:21 +00004116 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004117 return -1;
4118}
4119
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120Py_ssize_t
4121PyUnicode_GetLength(PyObject *unicode)
4122{
Victor Stinner07621332012-06-16 04:53:46 +02004123 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004124 PyErr_BadArgument();
4125 return -1;
4126 }
Victor Stinner07621332012-06-16 04:53:46 +02004127 if (PyUnicode_READY(unicode) == -1)
4128 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004129 return PyUnicode_GET_LENGTH(unicode);
4130}
4131
4132Py_UCS4
4133PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4134{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004135 void *data;
4136 int kind;
4137
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004138 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004139 PyErr_BadArgument();
4140 return (Py_UCS4)-1;
4141 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004142 if (PyUnicode_READY(unicode) == -1) {
4143 return (Py_UCS4)-1;
4144 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004145 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004146 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004147 return (Py_UCS4)-1;
4148 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004149 data = PyUnicode_DATA(unicode);
4150 kind = PyUnicode_KIND(unicode);
4151 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004152}
4153
4154int
4155PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4156{
4157 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004158 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004159 return -1;
4160 }
Victor Stinner488fa492011-12-12 00:01:39 +01004161 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004162 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004163 PyErr_SetString(PyExc_IndexError, "string index out of range");
4164 return -1;
4165 }
Victor Stinner488fa492011-12-12 00:01:39 +01004166 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004167 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004168 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4169 PyErr_SetString(PyExc_ValueError, "character out of range");
4170 return -1;
4171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004172 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4173 index, ch);
4174 return 0;
4175}
4176
/* Return the name of the default encoding; this is always the constant
   string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    const char *encoding = "utf-8";
    return encoding;
}
4182
Victor Stinner554f3f02010-06-16 23:33:54 +00004183/* create or adjust a UnicodeDecodeError */
4184static void
4185make_decode_exception(PyObject **exceptionObject,
4186 const char *encoding,
4187 const char *input, Py_ssize_t length,
4188 Py_ssize_t startpos, Py_ssize_t endpos,
4189 const char *reason)
4190{
4191 if (*exceptionObject == NULL) {
4192 *exceptionObject = PyUnicodeDecodeError_Create(
4193 encoding, input, length, startpos, endpos, reason);
4194 }
4195 else {
4196 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4197 goto onError;
4198 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4199 goto onError;
4200 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4201 goto onError;
4202 }
4203 return;
4204
4205onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004206 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004207}
4208
Steve Dowercc16be82016-09-08 10:35:16 -07004209#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004210static int
4211widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4212{
4213 if (newsize > *size) {
4214 wchar_t *newbuf = *buf;
4215 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4216 PyErr_NoMemory();
4217 return -1;
4218 }
4219 *buf = newbuf;
4220 }
4221 *size = newsize;
4222 return 0;
4223}
4224
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004225/* error handling callback helper:
4226 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004227 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004228 and adjust various state variables.
4229 return 0 on success, -1 on error
4230*/
4231
Alexander Belopolsky40018472011-02-26 01:02:56 +00004232static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233unicode_decode_call_errorhandler_wchar(
4234 const char *errors, PyObject **errorHandler,
4235 const char *encoding, const char *reason,
4236 const char **input, const char **inend, Py_ssize_t *startinpos,
4237 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004238 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004239{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004240 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004241
4242 PyObject *restuple = NULL;
4243 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004244 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004245 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004246 Py_ssize_t requiredsize;
4247 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004248 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004249 wchar_t *repwstr;
4250 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004251
4252 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004253 *errorHandler = PyCodec_LookupError(errors);
4254 if (*errorHandler == NULL)
4255 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004256 }
4257
Victor Stinner554f3f02010-06-16 23:33:54 +00004258 make_decode_exception(exceptionObject,
4259 encoding,
4260 *input, *inend - *input,
4261 *startinpos, *endinpos,
4262 reason);
4263 if (*exceptionObject == NULL)
4264 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004265
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004266 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004267 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004268 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004269 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004270 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004271 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004272 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004273 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004274 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004275
4276 /* Copy back the bytes variables, which might have been modified by the
4277 callback */
4278 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4279 if (!inputobj)
4280 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004281 *input = PyBytes_AS_STRING(inputobj);
4282 insize = PyBytes_GET_SIZE(inputobj);
4283 *inend = *input + insize;
4284 /* we can DECREF safely, as the exception has another reference,
4285 so the object won't go away. */
4286 Py_DECREF(inputobj);
4287
4288 if (newpos<0)
4289 newpos = insize+newpos;
4290 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004291 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 goto onError;
4293 }
4294
4295 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4296 if (repwstr == NULL)
4297 goto onError;
4298 /* need more space? (at least enough for what we
4299 have+the replacement+the rest of the string (starting
4300 at the new input position), so we won't have to check space
4301 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004302 requiredsize = *outpos;
4303 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4304 goto overflow;
4305 requiredsize += repwlen;
4306 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4307 goto overflow;
4308 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004309 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004310 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004311 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004312 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004313 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004314 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004315 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004316 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004317 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004318 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004319 *endinpos = newpos;
4320 *inptr = *input + newpos;
4321
4322 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004323 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004324 return 0;
4325
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004326 overflow:
4327 PyErr_SetString(PyExc_OverflowError,
4328 "decoded result is too long for a Python string");
4329
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004330 onError:
4331 Py_XDECREF(restuple);
4332 return -1;
4333}
Steve Dowercc16be82016-09-08 10:35:16 -07004334#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004335
4336static int
4337unicode_decode_call_errorhandler_writer(
4338 const char *errors, PyObject **errorHandler,
4339 const char *encoding, const char *reason,
4340 const char **input, const char **inend, Py_ssize_t *startinpos,
4341 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4342 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4343{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004344 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004345
4346 PyObject *restuple = NULL;
4347 PyObject *repunicode = NULL;
4348 Py_ssize_t insize;
4349 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004350 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004351 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004352 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004353 int need_to_grow = 0;
4354 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004355
4356 if (*errorHandler == NULL) {
4357 *errorHandler = PyCodec_LookupError(errors);
4358 if (*errorHandler == NULL)
4359 goto onError;
4360 }
4361
4362 make_decode_exception(exceptionObject,
4363 encoding,
4364 *input, *inend - *input,
4365 *startinpos, *endinpos,
4366 reason);
4367 if (*exceptionObject == NULL)
4368 goto onError;
4369
Jeroen Demeyer196a5302019-07-04 12:31:34 +02004370 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004371 if (restuple == NULL)
4372 goto onError;
4373 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004374 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004375 goto onError;
4376 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004377 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004378 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004379
4380 /* Copy back the bytes variables, which might have been modified by the
4381 callback */
4382 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4383 if (!inputobj)
4384 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004385 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004386 *input = PyBytes_AS_STRING(inputobj);
4387 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004388 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004389 /* we can DECREF safely, as the exception has another reference,
4390 so the object won't go away. */
4391 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004392
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004393 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004394 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004395 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004396 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004397 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004398 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004399
Victor Stinner170ca6f2013-04-18 00:25:28 +02004400 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004401 if (replen > 1) {
4402 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004403 need_to_grow = 1;
4404 }
4405 new_inptr = *input + newpos;
4406 if (*inend - new_inptr > remain) {
4407 /* We don't know the decoding algorithm here so we make the worst
4408 assumption that one byte decodes to one unicode character.
4409 If unfortunately one byte could decode to more unicode characters,
4410 the decoder may write out-of-bound then. Is it possible for the
4411 algorithms using this function? */
4412 writer->min_length += *inend - new_inptr - remain;
4413 need_to_grow = 1;
4414 }
4415 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004416 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004417 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004418 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4419 goto onError;
4420 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004421 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004422 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004423
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004424 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004425 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004426
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004427 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004428 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004429 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004430
Benjamin Peterson29060642009-01-31 22:14:21 +00004431 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004432 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004433 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004434}
4435
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
   NOTE: all macros in this section evaluate their argument more than once,
   so the argument must not have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is read-only.
 */

static const
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The (c) < 128 && (c) > 0 test runs first, so utf7_category is only
 * ever indexed with values in 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))

Alexander Belopolsky40018472011-02-26 01:02:56 +00004517PyObject *
4518PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004519 Py_ssize_t size,
4520 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004521{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004522 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4523}
4524
Antoine Pitrou244651a2009-05-04 18:56:13 +00004525/* The decoder. The only state we preserve is our read position,
4526 * i.e. how many characters we have consumed. So if we end in the
4527 * middle of a shift sequence we have to back off the read position
4528 * and the output to the beginning of the sequence, otherwise we lose
4529 * all the shift state (seen bits, number of bits seen, high
4530 * surrogate). */
4531
Alexander Belopolsky40018472011-02-26 01:02:56 +00004532PyObject *
4533PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004534 Py_ssize_t size,
4535 const char *errors,
4536 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004537{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004538 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004539 Py_ssize_t startinpos;
4540 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004542 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004543 const char *errmsg = "";
4544 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004545 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 unsigned int base64bits = 0;
4547 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004548 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004549 PyObject *errorHandler = NULL;
4550 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004551
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004552 if (size == 0) {
4553 if (consumed)
4554 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004555 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004556 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004557
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004558 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004559 _PyUnicodeWriter_Init(&writer);
4560 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004561
4562 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004563 e = s + size;
4564
4565 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004566 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004567 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004568 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004569
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 if (inShift) { /* in a base-64 section */
4571 if (IS_BASE64(ch)) { /* consume a base-64 character */
4572 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4573 base64bits += 6;
4574 s++;
4575 if (base64bits >= 16) {
4576 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004577 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 base64bits -= 16;
4579 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004580 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (surrogate) {
4582 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004583 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4584 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004585 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004586 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004587 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004588 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 }
4590 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004591 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004592 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004593 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004594 }
4595 }
Victor Stinner551ac952011-11-29 22:58:13 +01004596 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004597 /* first surrogate */
4598 surrogate = outCh;
4599 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004601 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004602 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 }
4604 }
4605 }
4606 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 if (base64bits > 0) { /* left-over bits */
4609 if (base64bits >= 6) {
4610 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004611 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004612 errmsg = "partial character in shift sequence";
4613 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004614 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004615 else {
4616 /* Some bits remain; they should be zero */
4617 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004618 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619 errmsg = "non-zero padding bits in shift sequence";
4620 goto utf7Error;
4621 }
4622 }
4623 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004624 if (surrogate && DECODE_DIRECT(ch)) {
4625 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4626 goto onError;
4627 }
4628 surrogate = 0;
4629 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 /* '-' is absorbed; other terminating
4631 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004632 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004633 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 }
4635 }
4636 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004637 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004638 s++; /* consume '+' */
4639 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004640 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004641 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004642 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004643 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004644 else if (s < e && !IS_BASE64(*s)) {
4645 s++;
4646 errmsg = "ill-formed sequence";
4647 goto utf7Error;
4648 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004649 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004651 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004653 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004654 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004655 }
4656 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004658 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004659 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004660 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 else {
4663 startinpos = s-starts;
4664 s++;
4665 errmsg = "unexpected special character";
4666 goto utf7Error;
4667 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004668 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004670 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004671 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004672 errors, &errorHandler,
4673 "utf7", errmsg,
4674 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004675 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004676 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004677 }
4678
Antoine Pitrou244651a2009-05-04 18:56:13 +00004679 /* end of string */
4680
4681 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4682 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004683 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004684 if (surrogate ||
4685 (base64bits >= 6) ||
4686 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004687 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004688 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004689 errors, &errorHandler,
4690 "utf7", "unterminated shift sequence",
4691 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004692 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004693 goto onError;
4694 if (s < e)
4695 goto restart;
4696 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004697 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004698
4699 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004700 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004701 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004702 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004703 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004704 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004705 writer.kind, writer.data, shiftOutStart);
4706 Py_XDECREF(errorHandler);
4707 Py_XDECREF(exc);
4708 _PyUnicodeWriter_Dealloc(&writer);
4709 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004710 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004711 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004712 }
4713 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004714 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004715 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004716 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004718 Py_XDECREF(errorHandler);
4719 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004720 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004721
Benjamin Peterson29060642009-01-31 22:14:21 +00004722 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004723 Py_XDECREF(errorHandler);
4724 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004725 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004726 return NULL;
4727}
4728
4729
Alexander Belopolsky40018472011-02-26 01:02:56 +00004730PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004731_PyUnicode_EncodeUTF7(PyObject *str,
4732 int base64SetO,
4733 int base64WhiteSpace,
4734 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004735{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004736 int kind;
4737 void *data;
4738 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004739 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004740 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004741 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 unsigned int base64bits = 0;
4743 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004744 char * out;
4745 char * start;
4746
Benjamin Petersonbac79492012-01-14 13:34:47 -05004747 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004748 return NULL;
4749 kind = PyUnicode_KIND(str);
4750 data = PyUnicode_DATA(str);
4751 len = PyUnicode_GET_LENGTH(str);
4752
4753 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004754 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004755
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004756 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004757 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004758 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004759 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004760 if (v == NULL)
4761 return NULL;
4762
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004763 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004764 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004765 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004766
Antoine Pitrou244651a2009-05-04 18:56:13 +00004767 if (inShift) {
4768 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4769 /* shifting out */
4770 if (base64bits) { /* output remaining bits */
4771 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4772 base64buffer = 0;
4773 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774 }
4775 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 /* Characters not in the BASE64 set implicitly unshift the sequence
4777 so no '-' is required, except if the character is itself a '-' */
4778 if (IS_BASE64(ch) || ch == '-') {
4779 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004781 *out++ = (char) ch;
4782 }
4783 else {
4784 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004785 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004786 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004787 else { /* not in a shift sequence */
4788 if (ch == '+') {
4789 *out++ = '+';
4790 *out++ = '-';
4791 }
4792 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4793 *out++ = (char) ch;
4794 }
4795 else {
4796 *out++ = '+';
4797 inShift = 1;
4798 goto encode_char;
4799 }
4800 }
4801 continue;
4802encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004803 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004804 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004805
Antoine Pitrou244651a2009-05-04 18:56:13 +00004806 /* code first surrogate */
4807 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004808 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004809 while (base64bits >= 6) {
4810 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4811 base64bits -= 6;
4812 }
4813 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004814 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004815 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004816 base64bits += 16;
4817 base64buffer = (base64buffer << 16) | ch;
4818 while (base64bits >= 6) {
4819 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4820 base64bits -= 6;
4821 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004822 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004823 if (base64bits)
4824 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4825 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004827 if (_PyBytes_Resize(&v, out - start) < 0)
4828 return NULL;
4829 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004830}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004831PyObject *
4832PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4833 Py_ssize_t size,
4834 int base64SetO,
4835 int base64WhiteSpace,
4836 const char *errors)
4837{
4838 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004839 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004840 if (tmp == NULL)
4841 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004842 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004843 base64WhiteSpace, errors);
4844 Py_DECREF(tmp);
4845 return result;
4846}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004847
Antoine Pitrou244651a2009-05-04 18:56:13 +00004848#undef IS_BASE64
4849#undef FROM_BASE64
4850#undef TO_BASE64
4851#undef DECODE_DIRECT
4852#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004853
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854/* --- UTF-8 Codec -------------------------------------------------------- */
4855
Alexander Belopolsky40018472011-02-26 01:02:56 +00004856PyObject *
4857PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004858 Py_ssize_t size,
4859 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004860{
Walter Dörwald69652032004-09-07 20:24:22 +00004861 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4862}
4863
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004864#include "stringlib/asciilib.h"
4865#include "stringlib/codecs.h"
4866#include "stringlib/undef.h"
4867
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004868#include "stringlib/ucs1lib.h"
4869#include "stringlib/codecs.h"
4870#include "stringlib/undef.h"
4871
4872#include "stringlib/ucs2lib.h"
4873#include "stringlib/codecs.h"
4874#include "stringlib/undef.h"
4875
4876#include "stringlib/ucs4lib.h"
4877#include "stringlib/codecs.h"
4878#include "stringlib/undef.h"
4879
Antoine Pitrouab868312009-01-10 15:40:25 +00004880/* Mask to quickly check whether a C 'long' contains a
4881 non-ASCII, UTF8-encoded char. */
4882#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004883# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004884#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004885# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004886#else
4887# error C 'long' size should be either 4 or 8!
4888#endif
4889
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004890static Py_ssize_t
4891ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004892{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004893 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004894 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004895
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004896 /*
4897 * Issue #17237: m68k is a bit different from most architectures in
4898 * that objects do not use "natural alignment" - for example, int and
4899 * long are only aligned at 2-byte boundaries. Therefore the assert()
4900 * won't work; also, tests have shown that skipping the "optimised
4901 * version" will even speed up m68k.
4902 */
4903#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004904#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004905 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4906 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004907 /* Fast path, see in STRINGLIB(utf8_decode) for
4908 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004909 /* Help allocation */
4910 const char *_p = p;
4911 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004912 while (_p < aligned_end) {
4913 unsigned long value = *(const unsigned long *) _p;
4914 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004915 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 *((unsigned long *)q) = value;
4917 _p += SIZEOF_LONG;
4918 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004919 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920 p = _p;
4921 while (p < end) {
4922 if ((unsigned char)*p & 0x80)
4923 break;
4924 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004926 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004927 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004928#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004929#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 while (p < end) {
4931 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4932 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004933 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004934 /* Help allocation */
4935 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004936 while (_p < aligned_end) {
4937 unsigned long value = *(unsigned long *) _p;
4938 if (value & ASCII_CHAR_MASK)
4939 break;
4940 _p += SIZEOF_LONG;
4941 }
4942 p = _p;
4943 if (_p == end)
4944 break;
4945 }
4946 if ((unsigned char)*p & 0x80)
4947 break;
4948 ++p;
4949 }
4950 memcpy(dest, start, p - start);
4951 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004952}
Antoine Pitrouab868312009-01-10 15:40:25 +00004953
Victor Stinner709d23d2019-05-02 14:56:30 -04004954static PyObject *
4955unicode_decode_utf8(const char *s, Py_ssize_t size,
4956 _Py_error_handler error_handler, const char *errors,
4957 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004958{
Victor Stinner785938e2011-12-11 20:09:03 +01004959 if (size == 0) {
4960 if (consumed)
4961 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004962 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004963 }
4964
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004965 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4966 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004967 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 *consumed = 1;
4969 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004970 }
4971
Inada Naoki770847a2019-06-24 12:30:24 +09004972 const char *starts = s;
4973 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004974
Inada Naoki770847a2019-06-24 12:30:24 +09004975 // fast path: try ASCII string.
4976 PyObject *u = PyUnicode_New(size, 127);
4977 if (u == NULL) {
4978 return NULL;
4979 }
4980 s += ascii_decode(s, end, PyUnicode_DATA(u));
4981 if (s == end) {
4982 return u;
4983 }
4984
4985 // Use _PyUnicodeWriter after fast path is failed.
4986 _PyUnicodeWriter writer;
4987 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4988 writer.pos = s - starts;
4989
4990 Py_ssize_t startinpos, endinpos;
4991 const char *errmsg = "";
4992 PyObject *error_handler_obj = NULL;
4993 PyObject *exc = NULL;
4994
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995 while (s < end) {
4996 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004997 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004998
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005000 if (PyUnicode_IS_ASCII(writer.buffer))
5001 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005003 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 } else {
5007 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005008 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 }
5010
5011 switch (ch) {
5012 case 0:
5013 if (s == end || consumed)
5014 goto End;
5015 errmsg = "unexpected end of data";
5016 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005017 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005018 break;
5019 case 1:
5020 errmsg = "invalid start byte";
5021 startinpos = s - starts;
5022 endinpos = startinpos + 1;
5023 break;
5024 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005025 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
5026 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5027 {
5028 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005029 goto End;
5030 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005031 /* fall through */
5032 case 3:
5033 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005034 errmsg = "invalid continuation byte";
5035 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005036 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005037 break;
5038 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005039 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005040 goto onError;
5041 continue;
5042 }
5043
Victor Stinner1d65d912015-10-05 13:43:50 +02005044 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005045 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005046
5047 switch (error_handler) {
5048 case _Py_ERROR_IGNORE:
5049 s += (endinpos - startinpos);
5050 break;
5051
5052 case _Py_ERROR_REPLACE:
5053 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5054 goto onError;
5055 s += (endinpos - startinpos);
5056 break;
5057
5058 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005059 {
5060 Py_ssize_t i;
5061
Victor Stinner1d65d912015-10-05 13:43:50 +02005062 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5063 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005064 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005065 ch = (Py_UCS4)(unsigned char)(starts[i]);
5066 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5067 ch + 0xdc00);
5068 writer.pos++;
5069 }
5070 s += (endinpos - startinpos);
5071 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005072 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005073
5074 default:
5075 if (unicode_decode_call_errorhandler_writer(
5076 errors, &error_handler_obj,
5077 "utf-8", errmsg,
5078 &starts, &end, &startinpos, &endinpos, &exc, &s,
5079 &writer))
5080 goto onError;
5081 }
Victor Stinner785938e2011-12-11 20:09:03 +01005082 }
5083
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005084End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 if (consumed)
5086 *consumed = s - starts;
5087
Victor Stinner1d65d912015-10-05 13:43:50 +02005088 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005090 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091
5092onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005093 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005094 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005095 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005096 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005097}
5098
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005099
Victor Stinner709d23d2019-05-02 14:56:30 -04005100PyObject *
5101PyUnicode_DecodeUTF8Stateful(const char *s,
5102 Py_ssize_t size,
5103 const char *errors,
5104 Py_ssize_t *consumed)
5105{
5106 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5107}
5108
5109
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005110/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5111 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005112
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005113 On success, write a pointer to a newly allocated wide character string into
5114 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5115 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005116
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005117 On memory allocation failure, return -1.
5118
5119 On decoding error (if surrogateescape is zero), return -2. If wlen is
5120 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5121 is not NULL, write the decoding error message into *reason. */
5122int
5123_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005124 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005125{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005126 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005127 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005128 wchar_t *unicode;
5129 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005130
Victor Stinner3d4226a2018-08-29 22:21:32 +02005131 int surrogateescape = 0;
5132 int surrogatepass = 0;
5133 switch (errors)
5134 {
5135 case _Py_ERROR_STRICT:
5136 break;
5137 case _Py_ERROR_SURROGATEESCAPE:
5138 surrogateescape = 1;
5139 break;
5140 case _Py_ERROR_SURROGATEPASS:
5141 surrogatepass = 1;
5142 break;
5143 default:
5144 return -3;
5145 }
5146
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005147 /* Note: size will always be longer than the resulting Unicode
5148 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005149 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005151 }
5152
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005153 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005154 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005155 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005156 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005157
5158 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005160 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005161 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005162 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005163#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005164 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005165#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005166 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005167#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005168 if (ch > 0xFF) {
5169#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005170 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005171#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005172 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005173 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005174 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5175 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5176#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005177 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005178 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005179 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005180 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005181 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005182
5183 if (surrogateescape) {
5184 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5185 }
5186 else {
5187 /* Is it a valid three-byte code? */
5188 if (surrogatepass
5189 && (e - s) >= 3
5190 && (s[0] & 0xf0) == 0xe0
5191 && (s[1] & 0xc0) == 0x80
5192 && (s[2] & 0xc0) == 0x80)
5193 {
5194 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5195 s += 3;
5196 unicode[outpos++] = ch;
5197 }
5198 else {
5199 PyMem_RawFree(unicode );
5200 if (reason != NULL) {
5201 switch (ch) {
5202 case 0:
5203 *reason = "unexpected end of data";
5204 break;
5205 case 1:
5206 *reason = "invalid start byte";
5207 break;
5208 /* 2, 3, 4 */
5209 default:
5210 *reason = "invalid continuation byte";
5211 break;
5212 }
5213 }
5214 if (wlen != NULL) {
5215 *wlen = s - orig_s;
5216 }
5217 return -2;
5218 }
5219 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005220 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005221 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005222 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005223 if (wlen) {
5224 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005225 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005226 *wstr = unicode;
5227 return 0;
5228}
5229
Victor Stinner5f9cf232019-03-19 01:46:25 +01005230
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005231wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005232_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5233 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005234{
5235 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005236 int res = _Py_DecodeUTF8Ex(arg, arglen,
5237 &wstr, wlen,
5238 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005239 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005240 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5241 assert(res != -3);
5242 if (wlen) {
5243 *wlen = (size_t)res;
5244 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005245 return NULL;
5246 }
5247 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005248}
5249
Antoine Pitrouab868312009-01-10 15:40:25 +00005250
Victor Stinnere47e6982017-12-21 15:45:16 +01005251/* UTF-8 encoder using the surrogateescape error handler .
5252
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005253 On success, return 0 and write the newly allocated character string (use
5254 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005255
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005256 On encoding failure, return -2 and write the position of the invalid
5257 surrogate character into *error_pos (if error_pos is set) and the decoding
5258 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005259
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005260 On memory allocation failure, return -1. */
5261int
5262_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005263 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005264{
5265 const Py_ssize_t max_char_size = 4;
5266 Py_ssize_t len = wcslen(text);
5267
5268 assert(len >= 0);
5269
Victor Stinner3d4226a2018-08-29 22:21:32 +02005270 int surrogateescape = 0;
5271 int surrogatepass = 0;
5272 switch (errors)
5273 {
5274 case _Py_ERROR_STRICT:
5275 break;
5276 case _Py_ERROR_SURROGATEESCAPE:
5277 surrogateescape = 1;
5278 break;
5279 case _Py_ERROR_SURROGATEPASS:
5280 surrogatepass = 1;
5281 break;
5282 default:
5283 return -3;
5284 }
5285
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005286 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5287 return -1;
5288 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005289 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005290 if (raw_malloc) {
5291 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005292 }
5293 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005295 }
5296 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005297 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005298 }
5299
5300 char *p = bytes;
5301 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005302 for (i = 0; i < len; ) {
5303 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005304 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005305 i++;
5306#if Py_UNICODE_SIZE == 2
5307 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5308 && i < len
5309 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5310 {
5311 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5312 i++;
5313 }
5314#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005315
5316 if (ch < 0x80) {
5317 /* Encode ASCII */
5318 *p++ = (char) ch;
5319
5320 }
5321 else if (ch < 0x0800) {
5322 /* Encode Latin-1 */
5323 *p++ = (char)(0xc0 | (ch >> 6));
5324 *p++ = (char)(0x80 | (ch & 0x3f));
5325 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005326 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005327 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005328 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005329 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005330 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005331 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005332 if (reason != NULL) {
5333 *reason = "encoding error";
5334 }
5335 if (raw_malloc) {
5336 PyMem_RawFree(bytes);
5337 }
5338 else {
5339 PyMem_Free(bytes);
5340 }
5341 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005342 }
5343 *p++ = (char)(ch & 0xff);
5344 }
5345 else if (ch < 0x10000) {
5346 *p++ = (char)(0xe0 | (ch >> 12));
5347 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5348 *p++ = (char)(0x80 | (ch & 0x3f));
5349 }
5350 else { /* ch >= 0x10000 */
5351 assert(ch <= MAX_UNICODE);
5352 /* Encode UCS4 Unicode ordinals */
5353 *p++ = (char)(0xf0 | (ch >> 18));
5354 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5355 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5356 *p++ = (char)(0x80 | (ch & 0x3f));
5357 }
5358 }
5359 *p++ = '\0';
5360
5361 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005362 char *bytes2;
5363 if (raw_malloc) {
5364 bytes2 = PyMem_RawRealloc(bytes, final_size);
5365 }
5366 else {
5367 bytes2 = PyMem_Realloc(bytes, final_size);
5368 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005369 if (bytes2 == NULL) {
5370 if (error_pos != NULL) {
5371 *error_pos = (size_t)-1;
5372 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005373 if (raw_malloc) {
5374 PyMem_RawFree(bytes);
5375 }
5376 else {
5377 PyMem_Free(bytes);
5378 }
5379 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005380 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005381 *str = bytes2;
5382 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005383}
5384
5385
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005386/* Primary internal function which creates utf8 encoded bytes objects.
5387
5388 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005389 and allocate exactly as much space needed at the end. Else allocate the
5390 maximum possible needed (4 result bytes per Unicode character), and return
5391 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005392*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005393static PyObject *
5394unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5395 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005396{
Victor Stinner6099a032011-12-18 14:22:26 +01005397 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005398 void *data;
5399 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005400
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005401 if (!PyUnicode_Check(unicode)) {
5402 PyErr_BadArgument();
5403 return NULL;
5404 }
5405
5406 if (PyUnicode_READY(unicode) == -1)
5407 return NULL;
5408
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005409 if (PyUnicode_UTF8(unicode))
5410 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5411 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005412
5413 kind = PyUnicode_KIND(unicode);
5414 data = PyUnicode_DATA(unicode);
5415 size = PyUnicode_GET_LENGTH(unicode);
5416
Benjamin Petersonead6b532011-12-20 17:23:42 -06005417 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005418 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005419 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005420 case PyUnicode_1BYTE_KIND:
5421 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5422 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005423 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005424 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005425 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005426 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005427 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005428 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005429}
5430
Alexander Belopolsky40018472011-02-26 01:02:56 +00005431PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005432_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5433{
5434 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5435}
5436
5437
5438PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005439PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5440 Py_ssize_t size,
5441 const char *errors)
5442{
5443 PyObject *v, *unicode;
5444
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005445 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005446 if (unicode == NULL)
5447 return NULL;
5448 v = _PyUnicode_AsUTF8String(unicode, errors);
5449 Py_DECREF(unicode);
5450 return v;
5451}
5452
5453PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005454PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005455{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005456 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005457}
5458
Walter Dörwald41980ca2007-08-16 21:55:45 +00005459/* --- UTF-32 Codec ------------------------------------------------------- */
5460
5461PyObject *
5462PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005463 Py_ssize_t size,
5464 const char *errors,
5465 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005466{
5467 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5468}
5469
5470PyObject *
5471PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005472 Py_ssize_t size,
5473 const char *errors,
5474 int *byteorder,
5475 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005476{
5477 const char *starts = s;
5478 Py_ssize_t startinpos;
5479 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005480 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005481 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005482 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005484 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005485 PyObject *errorHandler = NULL;
5486 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005487
Walter Dörwald41980ca2007-08-16 21:55:45 +00005488 q = (unsigned char *)s;
5489 e = q + size;
5490
5491 if (byteorder)
5492 bo = *byteorder;
5493
5494 /* Check for BOM marks (U+FEFF) in the input and adjust current
5495 byte order setting accordingly. In native mode, the leading BOM
5496 mark is skipped, in all other modes, it is copied to the output
5497 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005498 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005499 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005500 if (bom == 0x0000FEFF) {
5501 bo = -1;
5502 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005503 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005504 else if (bom == 0xFFFE0000) {
5505 bo = 1;
5506 q += 4;
5507 }
5508 if (byteorder)
5509 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005510 }
5511
Victor Stinnere64322e2012-10-30 23:12:47 +01005512 if (q == e) {
5513 if (consumed)
5514 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005515 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005516 }
5517
Victor Stinnere64322e2012-10-30 23:12:47 +01005518#ifdef WORDS_BIGENDIAN
5519 le = bo < 0;
5520#else
5521 le = bo <= 0;
5522#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005523 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005524
Victor Stinner8f674cc2013-04-17 23:02:17 +02005525 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005526 writer.min_length = (e - q + 3) / 4;
5527 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005528 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005529
Victor Stinnere64322e2012-10-30 23:12:47 +01005530 while (1) {
5531 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005532 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005533
Victor Stinnere64322e2012-10-30 23:12:47 +01005534 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005535 enum PyUnicode_Kind kind = writer.kind;
5536 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005537 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005538 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005539 if (le) {
5540 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005541 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005542 if (ch > maxch)
5543 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005544 if (kind != PyUnicode_1BYTE_KIND &&
5545 Py_UNICODE_IS_SURROGATE(ch))
5546 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005547 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005548 q += 4;
5549 } while (q <= last);
5550 }
5551 else {
5552 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005553 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005554 if (ch > maxch)
5555 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 if (kind != PyUnicode_1BYTE_KIND &&
5557 Py_UNICODE_IS_SURROGATE(ch))
5558 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005559 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005560 q += 4;
5561 } while (q <= last);
5562 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005563 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005564 }
5565
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005566 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005567 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005568 startinpos = ((const char *)q) - starts;
5569 endinpos = startinpos + 4;
5570 }
5571 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005573 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005574 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005575 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005576 startinpos = ((const char *)q) - starts;
5577 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005578 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005579 else {
5580 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005581 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005582 goto onError;
5583 q += 4;
5584 continue;
5585 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005586 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005587 startinpos = ((const char *)q) - starts;
5588 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005589 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005590
5591 /* The remaining input chars are ignored if the callback
5592 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005593 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005594 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005596 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005597 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005598 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005599 }
5600
Walter Dörwald41980ca2007-08-16 21:55:45 +00005601 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005602 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005603
Walter Dörwald41980ca2007-08-16 21:55:45 +00005604 Py_XDECREF(errorHandler);
5605 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005606 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005607
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005609 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005610 Py_XDECREF(errorHandler);
5611 Py_XDECREF(exc);
5612 return NULL;
5613}
5614
5615PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005616_PyUnicode_EncodeUTF32(PyObject *str,
5617 const char *errors,
5618 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005619{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005620 enum PyUnicode_Kind kind;
5621 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005622 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005623 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005624 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005625#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005626 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005627#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005628 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005629#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005630 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005631 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005632 PyObject *errorHandler = NULL;
5633 PyObject *exc = NULL;
5634 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005635
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005636 if (!PyUnicode_Check(str)) {
5637 PyErr_BadArgument();
5638 return NULL;
5639 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005640 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005641 return NULL;
5642 kind = PyUnicode_KIND(str);
5643 data = PyUnicode_DATA(str);
5644 len = PyUnicode_GET_LENGTH(str);
5645
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005646 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005647 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005649 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005650 if (v == NULL)
5651 return NULL;
5652
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005653 /* output buffer is 4-bytes aligned */
5654 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005655 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005656 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005657 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005658 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005659 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005660
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005662 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005664 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 else
5666 encoding = "utf-32";
5667
5668 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005669 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5670 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005671 }
5672
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005673 pos = 0;
5674 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005675 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005676
5677 if (kind == PyUnicode_2BYTE_KIND) {
5678 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5679 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005680 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005681 else {
5682 assert(kind == PyUnicode_4BYTE_KIND);
5683 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5684 &out, native_ordering);
5685 }
5686 if (pos == len)
5687 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005688
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005689 rep = unicode_encode_call_errorhandler(
5690 errors, &errorHandler,
5691 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005692 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005693 if (!rep)
5694 goto error;
5695
5696 if (PyBytes_Check(rep)) {
5697 repsize = PyBytes_GET_SIZE(rep);
5698 if (repsize & 3) {
5699 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005700 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005701 "surrogates not allowed");
5702 goto error;
5703 }
5704 moreunits = repsize / 4;
5705 }
5706 else {
5707 assert(PyUnicode_Check(rep));
5708 if (PyUnicode_READY(rep) < 0)
5709 goto error;
5710 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5711 if (!PyUnicode_IS_ASCII(rep)) {
5712 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005713 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005714 "surrogates not allowed");
5715 goto error;
5716 }
5717 }
5718
5719 /* four bytes are reserved for each surrogate */
5720 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005721 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005722 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005723 /* integer overflow */
5724 PyErr_NoMemory();
5725 goto error;
5726 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005727 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005728 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005729 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005730 }
5731
5732 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005733 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005734 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005735 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005736 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005737 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5738 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005739 }
5740
5741 Py_CLEAR(rep);
5742 }
5743
5744 /* Cut back to size actually needed. This is necessary for, for example,
5745 encoding of a string containing isolated surrogates and the 'ignore'
5746 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005747 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005748 if (nsize != PyBytes_GET_SIZE(v))
5749 _PyBytes_Resize(&v, nsize);
5750 Py_XDECREF(errorHandler);
5751 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005752 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005753 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005754 error:
5755 Py_XDECREF(rep);
5756 Py_XDECREF(errorHandler);
5757 Py_XDECREF(exc);
5758 Py_XDECREF(v);
5759 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005760}
5761
Alexander Belopolsky40018472011-02-26 01:02:56 +00005762PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005763PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5764 Py_ssize_t size,
5765 const char *errors,
5766 int byteorder)
5767{
5768 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005769 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005770 if (tmp == NULL)
5771 return NULL;
5772 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5773 Py_DECREF(tmp);
5774 return result;
5775}
5776
5777PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005778PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005779{
Victor Stinnerb960b342011-11-20 19:12:52 +01005780 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005781}
5782
Guido van Rossumd57fd912000-03-10 22:53:23 +00005783/* --- UTF-16 Codec ------------------------------------------------------- */
5784
Tim Peters772747b2001-08-09 22:21:55 +00005785PyObject *
5786PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005787 Py_ssize_t size,
5788 const char *errors,
5789 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005790{
Walter Dörwald69652032004-09-07 20:24:22 +00005791 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5792}
5793
5794PyObject *
5795PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005796 Py_ssize_t size,
5797 const char *errors,
5798 int *byteorder,
5799 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005800{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005801 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005802 Py_ssize_t startinpos;
5803 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005804 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005805 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005806 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005807 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005808 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005809 PyObject *errorHandler = NULL;
5810 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005811 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005812
Tim Peters772747b2001-08-09 22:21:55 +00005813 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005814 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005815
5816 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005817 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005818
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005819 /* Check for BOM marks (U+FEFF) in the input and adjust current
5820 byte order setting accordingly. In native mode, the leading BOM
5821 mark is skipped, in all other modes, it is copied to the output
5822 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005823 if (bo == 0 && size >= 2) {
5824 const Py_UCS4 bom = (q[1] << 8) | q[0];
5825 if (bom == 0xFEFF) {
5826 q += 2;
5827 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005829 else if (bom == 0xFFFE) {
5830 q += 2;
5831 bo = 1;
5832 }
5833 if (byteorder)
5834 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005835 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005836
Antoine Pitrou63065d72012-05-15 23:48:04 +02005837 if (q == e) {
5838 if (consumed)
5839 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005840 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005841 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005842
Christian Heimes743e0cd2012-10-17 23:52:17 +02005843#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005844 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005845 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005846#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005847 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005848 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005849#endif
Tim Peters772747b2001-08-09 22:21:55 +00005850
Antoine Pitrou63065d72012-05-15 23:48:04 +02005851 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005852 character count normally. Error handler will take care of
5853 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005854 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005855 writer.min_length = (e - q + 1) / 2;
5856 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005857 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005858
Antoine Pitrou63065d72012-05-15 23:48:04 +02005859 while (1) {
5860 Py_UCS4 ch = 0;
5861 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005862 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005863 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005865 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005866 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005867 native_ordering);
5868 else
5869 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005870 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005871 native_ordering);
5872 } else if (kind == PyUnicode_2BYTE_KIND) {
5873 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005874 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005875 native_ordering);
5876 } else {
5877 assert(kind == PyUnicode_4BYTE_KIND);
5878 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005879 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005880 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005881 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005882 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005883
Antoine Pitrou63065d72012-05-15 23:48:04 +02005884 switch (ch)
5885 {
5886 case 0:
5887 /* remaining byte at the end? (size should be even) */
5888 if (q == e || consumed)
5889 goto End;
5890 errmsg = "truncated data";
5891 startinpos = ((const char *)q) - starts;
5892 endinpos = ((const char *)e) - starts;
5893 break;
5894 /* The remaining input chars are ignored if the callback
5895 chooses to skip the input */
5896 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005897 q -= 2;
5898 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005899 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005900 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005901 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005902 endinpos = ((const char *)e) - starts;
5903 break;
5904 case 2:
5905 errmsg = "illegal encoding";
5906 startinpos = ((const char *)q) - 2 - starts;
5907 endinpos = startinpos + 2;
5908 break;
5909 case 3:
5910 errmsg = "illegal UTF-16 surrogate";
5911 startinpos = ((const char *)q) - 4 - starts;
5912 endinpos = startinpos + 2;
5913 break;
5914 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005915 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005916 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005917 continue;
5918 }
5919
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005920 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005921 errors,
5922 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005923 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005924 &starts,
5925 (const char **)&e,
5926 &startinpos,
5927 &endinpos,
5928 &exc,
5929 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005930 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005931 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005932 }
5933
Antoine Pitrou63065d72012-05-15 23:48:04 +02005934End:
Walter Dörwald69652032004-09-07 20:24:22 +00005935 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005936 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005937
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005938 Py_XDECREF(errorHandler);
5939 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005940 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005941
Benjamin Peterson29060642009-01-31 22:14:21 +00005942 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005943 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005944 Py_XDECREF(errorHandler);
5945 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005946 return NULL;
5947}
5948
Tim Peters772747b2001-08-09 22:21:55 +00005949PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005950_PyUnicode_EncodeUTF16(PyObject *str,
5951 const char *errors,
5952 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005953{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005954 enum PyUnicode_Kind kind;
5955 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005956 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005957 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005958 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005959 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005960#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005961 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005962#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005963 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005964#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005965 const char *encoding;
5966 Py_ssize_t nsize, pos;
5967 PyObject *errorHandler = NULL;
5968 PyObject *exc = NULL;
5969 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005970
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005971 if (!PyUnicode_Check(str)) {
5972 PyErr_BadArgument();
5973 return NULL;
5974 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005975 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005976 return NULL;
5977 kind = PyUnicode_KIND(str);
5978 data = PyUnicode_DATA(str);
5979 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005980
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005981 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005982 if (kind == PyUnicode_4BYTE_KIND) {
5983 const Py_UCS4 *in = (const Py_UCS4 *)data;
5984 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005985 while (in < end) {
5986 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005987 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005988 }
5989 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005990 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005991 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005992 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005993 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005994 nsize = len + pairs + (byteorder == 0);
5995 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005996 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005997 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005998 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005999
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006000 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006001 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006002 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006003 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006004 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006005 }
6006 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006007 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006008 }
Tim Peters772747b2001-08-09 22:21:55 +00006009
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006010 if (kind == PyUnicode_1BYTE_KIND) {
6011 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6012 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006013 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006014
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006015 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006016 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006017 }
6018 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006019 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006020 }
6021 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006022 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006023 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006024
6025 pos = 0;
6026 while (pos < len) {
6027 Py_ssize_t repsize, moreunits;
6028
6029 if (kind == PyUnicode_2BYTE_KIND) {
6030 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6031 &out, native_ordering);
6032 }
6033 else {
6034 assert(kind == PyUnicode_4BYTE_KIND);
6035 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6036 &out, native_ordering);
6037 }
6038 if (pos == len)
6039 break;
6040
6041 rep = unicode_encode_call_errorhandler(
6042 errors, &errorHandler,
6043 encoding, "surrogates not allowed",
6044 str, &exc, pos, pos + 1, &pos);
6045 if (!rep)
6046 goto error;
6047
6048 if (PyBytes_Check(rep)) {
6049 repsize = PyBytes_GET_SIZE(rep);
6050 if (repsize & 1) {
6051 raise_encode_exception(&exc, encoding,
6052 str, pos - 1, pos,
6053 "surrogates not allowed");
6054 goto error;
6055 }
6056 moreunits = repsize / 2;
6057 }
6058 else {
6059 assert(PyUnicode_Check(rep));
6060 if (PyUnicode_READY(rep) < 0)
6061 goto error;
6062 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6063 if (!PyUnicode_IS_ASCII(rep)) {
6064 raise_encode_exception(&exc, encoding,
6065 str, pos - 1, pos,
6066 "surrogates not allowed");
6067 goto error;
6068 }
6069 }
6070
6071 /* two bytes are reserved for each surrogate */
6072 if (moreunits > 1) {
6073 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006074 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006075 /* integer overflow */
6076 PyErr_NoMemory();
6077 goto error;
6078 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006079 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006080 goto error;
6081 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6082 }
6083
6084 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006085 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006086 out += moreunits;
6087 } else /* rep is unicode */ {
6088 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6089 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6090 &out, native_ordering);
6091 }
6092
6093 Py_CLEAR(rep);
6094 }
6095
6096 /* Cut back to size actually needed. This is necessary for, for example,
6097 encoding of a string containing isolated surrogates and the 'ignore' handler
6098 is used. */
6099 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6100 if (nsize != PyBytes_GET_SIZE(v))
6101 _PyBytes_Resize(&v, nsize);
6102 Py_XDECREF(errorHandler);
6103 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006104 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006105 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006106 error:
6107 Py_XDECREF(rep);
6108 Py_XDECREF(errorHandler);
6109 Py_XDECREF(exc);
6110 Py_XDECREF(v);
6111 return NULL;
6112#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006113}
6114
Alexander Belopolsky40018472011-02-26 01:02:56 +00006115PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006116PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6117 Py_ssize_t size,
6118 const char *errors,
6119 int byteorder)
6120{
6121 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006122 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006123 if (tmp == NULL)
6124 return NULL;
6125 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6126 Py_DECREF(tmp);
6127 return result;
6128}
6129
6130PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006131PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006132{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006133 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006134}
6135
6136/* --- Unicode Escape Codec ----------------------------------------------- */
6137
/* File-scope pointer to a _PyUnicode_Name_CAPI table; it starts out NULL.
   NOTE(review): presumably filled in on first use by the character-name
   escape handling of the unicode-escape decoder -- the code that assigns
   and reads it is not visible here, confirm against the rest of the file. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006138static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006139
Alexander Belopolsky40018472011-02-26 01:02:56 +00006140PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006141_PyUnicode_DecodeUnicodeEscape(const char *s,
6142 Py_ssize_t size,
6143 const char *errors,
6144 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006146 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006147 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006148 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006149 PyObject *errorHandler = NULL;
6150 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006151
Eric V. Smith42454af2016-10-31 09:22:08 -04006152 // so we can remember if we've seen an invalid escape char or not
6153 *first_invalid_escape = NULL;
6154
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006156 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006157 }
6158 /* Escaped strings will always be longer than the resulting
6159 Unicode string, so we start with size here and then reduce the
6160 length after conversion to the true value.
6161 (but if the error callback returns a long replacement string
6162 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006163 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006164 writer.min_length = size;
6165 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6166 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006167 }
6168
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169 end = s + size;
6170 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 unsigned char c = (unsigned char) *s++;
6172 Py_UCS4 ch;
6173 int count;
6174 Py_ssize_t startinpos;
6175 Py_ssize_t endinpos;
6176 const char *message;
6177
6178#define WRITE_ASCII_CHAR(ch) \
6179 do { \
6180 assert(ch <= 127); \
6181 assert(writer.pos < writer.size); \
6182 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6183 } while(0)
6184
6185#define WRITE_CHAR(ch) \
6186 do { \
6187 if (ch <= writer.maxchar) { \
6188 assert(writer.pos < writer.size); \
6189 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6190 } \
6191 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6192 goto onError; \
6193 } \
6194 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006195
6196 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 if (c != '\\') {
6198 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006199 continue;
6200 }
6201
Victor Stinner62ec3312016-09-06 17:04:34 -07006202 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006203 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006204 if (s >= end) {
6205 message = "\\ at end of string";
6206 goto error;
6207 }
6208 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006209
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006211 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006212
Benjamin Peterson29060642009-01-31 22:14:21 +00006213 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006214 case '\n': continue;
6215 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6216 case '\'': WRITE_ASCII_CHAR('\''); continue;
6217 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6218 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006219 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006220 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6221 case 't': WRITE_ASCII_CHAR('\t'); continue;
6222 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6223 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006224 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006225 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006226 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
Benjamin Peterson29060642009-01-31 22:14:21 +00006229 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006230 case '0': case '1': case '2': case '3':
6231 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006233 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006234 ch = (ch<<3) + *s++ - '0';
6235 if (s < end && '0' <= *s && *s <= '7') {
6236 ch = (ch<<3) + *s++ - '0';
6237 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006238 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006239 WRITE_CHAR(ch);
6240 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006241
Benjamin Peterson29060642009-01-31 22:14:21 +00006242 /* hex escapes */
6243 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006244 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006245 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006246 message = "truncated \\xXX escape";
6247 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006248
Benjamin Peterson29060642009-01-31 22:14:21 +00006249 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006250 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006252 message = "truncated \\uXXXX escape";
6253 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006254
Benjamin Peterson29060642009-01-31 22:14:21 +00006255 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006256 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006257 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006258 message = "truncated \\UXXXXXXXX escape";
6259 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006261 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006262 ch <<= 4;
6263 if (c >= '0' && c <= '9') {
6264 ch += c - '0';
6265 }
6266 else if (c >= 'a' && c <= 'f') {
6267 ch += c - ('a' - 10);
6268 }
6269 else if (c >= 'A' && c <= 'F') {
6270 ch += c - ('A' - 10);
6271 }
6272 else {
6273 break;
6274 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006275 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006276 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006277 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 }
6279
6280 /* when we get here, ch is a 32-bit unicode character */
6281 if (ch > MAX_UNICODE) {
6282 message = "illegal Unicode character";
6283 goto error;
6284 }
6285
6286 WRITE_CHAR(ch);
6287 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006288
Benjamin Peterson29060642009-01-31 22:14:21 +00006289 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006290 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006291 if (ucnhash_CAPI == NULL) {
6292 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006293 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6294 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 if (ucnhash_CAPI == NULL) {
6296 PyErr_SetString(
6297 PyExc_UnicodeError,
6298 "\\N escapes not supported (can't load unicodedata module)"
6299 );
6300 goto onError;
6301 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006302 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006303
6304 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006305 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006306 const char *start = ++s;
6307 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006308 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006310 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 namelen = s - start;
6312 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006313 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006314 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 ch = 0xffffffff; /* in case 'getcode' messes up */
6316 if (namelen <= INT_MAX &&
6317 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6318 &ch, 0)) {
6319 assert(ch <= MAX_UNICODE);
6320 WRITE_CHAR(ch);
6321 continue;
6322 }
6323 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006324 }
6325 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006326 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006327
6328 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006329 if (*first_invalid_escape == NULL) {
6330 *first_invalid_escape = s-1; /* Back up one char, since we've
6331 already incremented s. */
6332 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 WRITE_ASCII_CHAR('\\');
6334 WRITE_CHAR(c);
6335 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006336 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006337
6338 error:
6339 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006341 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006342 errors, &errorHandler,
6343 "unicodeescape", message,
6344 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006345 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006346 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006348 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006349
6350#undef WRITE_ASCII_CHAR
6351#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006352 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006353
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006354 Py_XDECREF(errorHandler);
6355 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006356 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006357
Benjamin Peterson29060642009-01-31 22:14:21 +00006358 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006359 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006360 Py_XDECREF(errorHandler);
6361 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006362 return NULL;
6363}
6364
Eric V. Smith42454af2016-10-31 09:22:08 -04006365PyObject *
6366PyUnicode_DecodeUnicodeEscape(const char *s,
6367 Py_ssize_t size,
6368 const char *errors)
6369{
6370 const char *first_invalid_escape;
6371 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6372 &first_invalid_escape);
6373 if (result == NULL)
6374 return NULL;
6375 if (first_invalid_escape != NULL) {
6376 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6377 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006378 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006379 Py_DECREF(result);
6380 return NULL;
6381 }
6382 }
6383 return result;
6384}
6385
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006386/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006387
Alexander Belopolsky40018472011-02-26 01:02:56 +00006388PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006389PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006390{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006391 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006392 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006393 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006395 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006397
Ezio Melottie7f90372012-10-05 03:33:31 +03006398 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006399 escape.
6400
Ezio Melottie7f90372012-10-05 03:33:31 +03006401 For UCS1 strings it's '\xxx', 4 bytes per source character.
6402 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6403 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006404 */
6405
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006406 if (!PyUnicode_Check(unicode)) {
6407 PyErr_BadArgument();
6408 return NULL;
6409 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006411 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006412 }
Victor Stinner358af132015-10-12 22:36:57 +02006413
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006414 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006415 if (len == 0) {
6416 return PyBytes_FromStringAndSize(NULL, 0);
6417 }
6418
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006419 kind = PyUnicode_KIND(unicode);
6420 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006421 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6422 bytes, and 1 byte characters 4. */
6423 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006424 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 return PyErr_NoMemory();
6426 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006427 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006428 if (repr == NULL) {
6429 return NULL;
6430 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006431
Victor Stinner62ec3312016-09-06 17:04:34 -07006432 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006433 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006434 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006435
Victor Stinner62ec3312016-09-06 17:04:34 -07006436 /* U+0000-U+00ff range */
6437 if (ch < 0x100) {
6438 if (ch >= ' ' && ch < 127) {
6439 if (ch != '\\') {
6440 /* Copy printable US ASCII as-is */
6441 *p++ = (char) ch;
6442 }
6443 /* Escape backslashes */
6444 else {
6445 *p++ = '\\';
6446 *p++ = '\\';
6447 }
6448 }
Victor Stinner358af132015-10-12 22:36:57 +02006449
Victor Stinner62ec3312016-09-06 17:04:34 -07006450 /* Map special whitespace to '\t', \n', '\r' */
6451 else if (ch == '\t') {
6452 *p++ = '\\';
6453 *p++ = 't';
6454 }
6455 else if (ch == '\n') {
6456 *p++ = '\\';
6457 *p++ = 'n';
6458 }
6459 else if (ch == '\r') {
6460 *p++ = '\\';
6461 *p++ = 'r';
6462 }
6463
6464 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6465 else {
6466 *p++ = '\\';
6467 *p++ = 'x';
6468 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6469 *p++ = Py_hexdigits[ch & 0x000F];
6470 }
Tim Petersced69f82003-09-16 20:30:58 +00006471 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006472 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006473 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006474 *p++ = '\\';
6475 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006476 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6477 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6478 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6479 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006480 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006481 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6482 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006483
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 /* Make sure that the first two digits are zero */
6485 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006486 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006487 *p++ = 'U';
6488 *p++ = '0';
6489 *p++ = '0';
6490 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6491 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6492 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6493 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6494 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6495 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006496 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006497 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006498
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 assert(p - PyBytes_AS_STRING(repr) > 0);
6500 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6501 return NULL;
6502 }
6503 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006504}
6505
Alexander Belopolsky40018472011-02-26 01:02:56 +00006506PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006507PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6508 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006509{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006510 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006511 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006512 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006514 }
6515
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006516 result = PyUnicode_AsUnicodeEscapeString(tmp);
6517 Py_DECREF(tmp);
6518 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006519}
6520
6521/* --- Raw Unicode Escape Codec ------------------------------------------- */
6522
Alexander Belopolsky40018472011-02-26 01:02:56 +00006523PyObject *
6524PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006525 Py_ssize_t size,
6526 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006527{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006528 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006529 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006531 PyObject *errorHandler = NULL;
6532 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006533
Victor Stinner62ec3312016-09-06 17:04:34 -07006534 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006535 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006536 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006537
Guido van Rossumd57fd912000-03-10 22:53:23 +00006538 /* Escaped strings will always be longer than the resulting
6539 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 length after conversion to the true value. (But decoding error
6541 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006542 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006543 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006544 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6545 goto onError;
6546 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006547
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 end = s + size;
6549 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006550 unsigned char c = (unsigned char) *s++;
6551 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006552 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006553 Py_ssize_t startinpos;
6554 Py_ssize_t endinpos;
6555 const char *message;
6556
6557#define WRITE_CHAR(ch) \
6558 do { \
6559 if (ch <= writer.maxchar) { \
6560 assert(writer.pos < writer.size); \
6561 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6562 } \
6563 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6564 goto onError; \
6565 } \
6566 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567
Benjamin Peterson29060642009-01-31 22:14:21 +00006568 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 if (c != '\\' || s >= end) {
6570 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006571 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006572 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006573
Victor Stinner62ec3312016-09-06 17:04:34 -07006574 c = (unsigned char) *s++;
6575 if (c == 'u') {
6576 count = 4;
6577 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006578 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 else if (c == 'U') {
6580 count = 8;
6581 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006582 }
6583 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006584 assert(writer.pos < writer.size);
6585 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6586 WRITE_CHAR(c);
6587 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006588 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006589 startinpos = s - starts - 2;
6590
6591 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6592 for (ch = 0; count && s < end; ++s, --count) {
6593 c = (unsigned char)*s;
6594 ch <<= 4;
6595 if (c >= '0' && c <= '9') {
6596 ch += c - '0';
6597 }
6598 else if (c >= 'a' && c <= 'f') {
6599 ch += c - ('a' - 10);
6600 }
6601 else if (c >= 'A' && c <= 'F') {
6602 ch += c - ('A' - 10);
6603 }
6604 else {
6605 break;
6606 }
6607 }
6608 if (!count) {
6609 if (ch <= MAX_UNICODE) {
6610 WRITE_CHAR(ch);
6611 continue;
6612 }
6613 message = "\\Uxxxxxxxx out of range";
6614 }
6615
6616 endinpos = s-starts;
6617 writer.min_length = end - s + writer.pos;
6618 if (unicode_decode_call_errorhandler_writer(
6619 errors, &errorHandler,
6620 "rawunicodeescape", message,
6621 &starts, &end, &startinpos, &endinpos, &exc, &s,
6622 &writer)) {
6623 goto onError;
6624 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006625 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006626
6627#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006628 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006629 Py_XDECREF(errorHandler);
6630 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006631 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006632
Benjamin Peterson29060642009-01-31 22:14:21 +00006633 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006634 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006635 Py_XDECREF(errorHandler);
6636 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006637 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006638
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639}
6640
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006641
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006643PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006644{
Victor Stinner62ec3312016-09-06 17:04:34 -07006645 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006646 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006647 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006648 int kind;
6649 void *data;
6650 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006652 if (!PyUnicode_Check(unicode)) {
6653 PyErr_BadArgument();
6654 return NULL;
6655 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006656 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006657 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006658 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006659 kind = PyUnicode_KIND(unicode);
6660 data = PyUnicode_DATA(unicode);
6661 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006662 if (kind == PyUnicode_1BYTE_KIND) {
6663 return PyBytes_FromStringAndSize(data, len);
6664 }
Victor Stinner0e368262011-11-10 20:12:49 +01006665
Victor Stinner62ec3312016-09-06 17:04:34 -07006666 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6667 bytes, and 1 byte characters 4. */
6668 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006669
Victor Stinner62ec3312016-09-06 17:04:34 -07006670 if (len > PY_SSIZE_T_MAX / expandsize) {
6671 return PyErr_NoMemory();
6672 }
6673 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6674 if (repr == NULL) {
6675 return NULL;
6676 }
6677 if (len == 0) {
6678 return repr;
6679 }
6680
6681 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006682 for (pos = 0; pos < len; pos++) {
6683 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006684
Victor Stinner62ec3312016-09-06 17:04:34 -07006685 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6686 if (ch < 0x100) {
6687 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006688 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006689 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006690 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006691 *p++ = '\\';
6692 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006693 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6694 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6695 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6696 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006697 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006698 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6699 else {
6700 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6701 *p++ = '\\';
6702 *p++ = 'U';
6703 *p++ = '0';
6704 *p++ = '0';
6705 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6706 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6707 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6708 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6709 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6710 *p++ = Py_hexdigits[ch & 15];
6711 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006712 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006713
Victor Stinner62ec3312016-09-06 17:04:34 -07006714 assert(p > PyBytes_AS_STRING(repr));
6715 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6716 return NULL;
6717 }
6718 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006719}
6720
Alexander Belopolsky40018472011-02-26 01:02:56 +00006721PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006722PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6723 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006725 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006726 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006727 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006728 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006729 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6730 Py_DECREF(tmp);
6731 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006732}
6733
6734/* --- Latin-1 Codec ------------------------------------------------------ */
6735
Alexander Belopolsky40018472011-02-26 01:02:56 +00006736PyObject *
6737PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006738 Py_ssize_t size,
6739 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006740{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006741 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006742 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006743}
6744
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006746static void
6747make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006748 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006749 PyObject *unicode,
6750 Py_ssize_t startpos, Py_ssize_t endpos,
6751 const char *reason)
6752{
6753 if (*exceptionObject == NULL) {
6754 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006755 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006756 encoding, unicode, startpos, endpos, reason);
6757 }
6758 else {
6759 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6760 goto onError;
6761 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6762 goto onError;
6763 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6764 goto onError;
6765 return;
6766 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006767 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006768 }
6769}
6770
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006771/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006772static void
6773raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006774 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006775 PyObject *unicode,
6776 Py_ssize_t startpos, Py_ssize_t endpos,
6777 const char *reason)
6778{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006779 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006780 encoding, unicode, startpos, endpos, reason);
6781 if (*exceptionObject != NULL)
6782 PyCodec_StrictErrors(*exceptionObject);
6783}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784
6785/* error handling callback helper:
6786 build arguments, call the callback and check the arguments,
6787 put the result into newpos and return the replacement string, which
6788 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006789static PyObject *
6790unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006791 PyObject **errorHandler,
6792 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006793 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006794 Py_ssize_t startpos, Py_ssize_t endpos,
6795 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006796{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006797 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006799 PyObject *restuple;
6800 PyObject *resunicode;
6801
6802 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806 }
6807
Benjamin Petersonbac79492012-01-14 13:34:47 -05006808 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006809 return NULL;
6810 len = PyUnicode_GET_LENGTH(unicode);
6811
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006812 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006813 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006814 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006816
Jeroen Demeyer196a5302019-07-04 12:31:34 +02006817 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006818 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006819 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006820 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006821 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006822 Py_DECREF(restuple);
6823 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006824 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006825 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 &resunicode, newpos)) {
6827 Py_DECREF(restuple);
6828 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006830 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6831 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6832 Py_DECREF(restuple);
6833 return NULL;
6834 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006835 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006836 *newpos = len + *newpos;
6837 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006838 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006839 Py_DECREF(restuple);
6840 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006841 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006842 Py_INCREF(resunicode);
6843 Py_DECREF(restuple);
6844 return resunicode;
6845}
6846
Alexander Belopolsky40018472011-02-26 01:02:56 +00006847static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006848unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006849 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006850 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006852 /* input state */
6853 Py_ssize_t pos=0, size;
6854 int kind;
6855 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006856 /* pointer into the output */
6857 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006858 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6859 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006860 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006861 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006862 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006863 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006864 /* output object */
6865 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006866
Benjamin Petersonbac79492012-01-14 13:34:47 -05006867 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006868 return NULL;
6869 size = PyUnicode_GET_LENGTH(unicode);
6870 kind = PyUnicode_KIND(unicode);
6871 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006872 /* allocate enough for a simple encoding without
6873 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006874 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006875 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006876
6877 _PyBytesWriter_Init(&writer);
6878 str = _PyBytesWriter_Alloc(&writer, size);
6879 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006880 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006881
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006882 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006883 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884
Benjamin Peterson29060642009-01-31 22:14:21 +00006885 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006886 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006887 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006888 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006889 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006890 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006891 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006892 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006894 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006895 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006896 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006897
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006898 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006900
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006901 /* Only overallocate the buffer if it's not the last write */
6902 writer.overallocate = (collend < size);
6903
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006905 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006906 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006907
6908 switch (error_handler) {
6909 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006910 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006911 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006912
6913 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006914 memset(str, '?', collend - collstart);
6915 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006916 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006917 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006918 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006919 break;
Victor Stinner50149202015-09-22 00:26:54 +02006920
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006921 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006922 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006923 writer.min_size -= (collend - collstart);
6924 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006925 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006926 if (str == NULL)
6927 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006928 pos = collend;
6929 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006930
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006931 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006932 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006933 writer.min_size -= (collend - collstart);
6934 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006935 unicode, collstart, collend);
6936 if (str == NULL)
6937 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006938 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006939 break;
Victor Stinner50149202015-09-22 00:26:54 +02006940
Victor Stinnerc3713e92015-09-29 12:32:13 +02006941 case _Py_ERROR_SURROGATEESCAPE:
6942 for (i = collstart; i < collend; ++i) {
6943 ch = PyUnicode_READ(kind, data, i);
6944 if (ch < 0xdc80 || 0xdcff < ch) {
6945 /* Not a UTF-8b surrogate */
6946 break;
6947 }
6948 *str++ = (char)(ch - 0xdc00);
6949 ++pos;
6950 }
6951 if (i >= collend)
6952 break;
6953 collstart = pos;
6954 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006955 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006956
Benjamin Peterson29060642009-01-31 22:14:21 +00006957 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006958 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6959 encoding, reason, unicode, &exc,
6960 collstart, collend, &newpos);
6961 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006962 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006963
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006964 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006965 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006966
Victor Stinner6bd525b2015-10-09 13:10:05 +02006967 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006968 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006969 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006970 PyBytes_AS_STRING(rep),
6971 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006972 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006973 else {
6974 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006975
Victor Stinner6bd525b2015-10-09 13:10:05 +02006976 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006977 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006978
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006979 if (limit == 256 ?
6980 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6981 !PyUnicode_IS_ASCII(rep))
6982 {
6983 /* Not all characters are smaller than limit */
6984 raise_encode_exception(&exc, encoding, unicode,
6985 collstart, collend, reason);
6986 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006987 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006988 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6989 str = _PyBytesWriter_WriteBytes(&writer, str,
6990 PyUnicode_DATA(rep),
6991 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006992 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006993 if (str == NULL)
6994 goto onError;
6995
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006996 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006997 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006998 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006999
7000 /* If overallocation was disabled, ensure that it was the last
7001 write. Otherwise, we missed an optimization */
7002 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007003 }
7004 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007005
Victor Stinner50149202015-09-22 00:26:54 +02007006 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007007 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007008 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007009
7010 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007011 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007012 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007013 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007014 Py_XDECREF(exc);
7015 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007016}
7017
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007018/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007019PyObject *
7020PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007021 Py_ssize_t size,
7022 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007023{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007024 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007025 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007026 if (unicode == NULL)
7027 return NULL;
7028 result = unicode_encode_ucs1(unicode, errors, 256);
7029 Py_DECREF(unicode);
7030 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007031}
7032
Alexander Belopolsky40018472011-02-26 01:02:56 +00007033PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007034_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007035{
7036 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007037 PyErr_BadArgument();
7038 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007040 if (PyUnicode_READY(unicode) == -1)
7041 return NULL;
7042 /* Fast path: if it is a one-byte string, construct
7043 bytes object directly. */
7044 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7045 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7046 PyUnicode_GET_LENGTH(unicode));
7047 /* Non-Latin-1 characters present. Defer to above function to
7048 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007049 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007050}
7051
7052PyObject*
7053PyUnicode_AsLatin1String(PyObject *unicode)
7054{
7055 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056}
7057
7058/* --- 7-bit ASCII Codec -------------------------------------------------- */
7059
Alexander Belopolsky40018472011-02-26 01:02:56 +00007060PyObject *
7061PyUnicode_DecodeASCII(const char *s,
7062 Py_ssize_t size,
7063 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007066 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007067 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007068 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007069 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007070
Guido van Rossumd57fd912000-03-10 22:53:23 +00007071 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007072 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007073
Guido van Rossumd57fd912000-03-10 22:53:23 +00007074 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007075 if (size == 1 && (unsigned char)s[0] < 128)
7076 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007077
Inada Naoki770847a2019-06-24 12:30:24 +09007078 // Shortcut for simple case
7079 PyObject *u = PyUnicode_New(size, 127);
7080 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007081 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007082 }
7083 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7084 if (outpos == size) {
7085 return u;
7086 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007087
Inada Naoki770847a2019-06-24 12:30:24 +09007088 _PyUnicodeWriter writer;
7089 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007090 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007091
Inada Naoki770847a2019-06-24 12:30:24 +09007092 s += outpos;
7093 int kind = writer.kind;
7094 void *data = writer.data;
7095 Py_ssize_t startinpos, endinpos;
7096
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007097 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007098 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007099 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007100 PyUnicode_WRITE(kind, data, writer.pos, c);
7101 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007102 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007103 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007104 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007105
7106 /* byte outsize range 0x00..0x7f: call the error handler */
7107
7108 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007109 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007110
7111 switch (error_handler)
7112 {
7113 case _Py_ERROR_REPLACE:
7114 case _Py_ERROR_SURROGATEESCAPE:
7115 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007116 but we may switch to UCS2 at the first write */
7117 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7118 goto onError;
7119 kind = writer.kind;
7120 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007121
7122 if (error_handler == _Py_ERROR_REPLACE)
7123 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7124 else
7125 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7126 writer.pos++;
7127 ++s;
7128 break;
7129
7130 case _Py_ERROR_IGNORE:
7131 ++s;
7132 break;
7133
7134 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 startinpos = s-starts;
7136 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007137 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007138 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007139 "ascii", "ordinal not in range(128)",
7140 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007141 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007142 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007143 kind = writer.kind;
7144 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007145 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007146 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007147 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007148 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007149 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007150
Benjamin Peterson29060642009-01-31 22:14:21 +00007151 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007152 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007153 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007154 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007155 return NULL;
7156}
7157
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007158/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007159PyObject *
7160PyUnicode_EncodeASCII(const Py_UNICODE *p,
7161 Py_ssize_t size,
7162 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007163{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007164 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007165 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007166 if (unicode == NULL)
7167 return NULL;
7168 result = unicode_encode_ucs1(unicode, errors, 128);
7169 Py_DECREF(unicode);
7170 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007171}
7172
Alexander Belopolsky40018472011-02-26 01:02:56 +00007173PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007174_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007175{
7176 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007177 PyErr_BadArgument();
7178 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007180 if (PyUnicode_READY(unicode) == -1)
7181 return NULL;
7182 /* Fast path: if it is an ASCII-only string, construct bytes object
7183 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007184 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007185 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7186 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007187 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007188}
7189
7190PyObject *
7191PyUnicode_AsASCIIString(PyObject *unicode)
7192{
7193 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007194}
7195
Steve Dowercc16be82016-09-08 10:35:16 -07007196#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007197
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007198/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007199
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007200#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201#define NEED_RETRY
7202#endif
7203
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7208#define DECODING_CHUNK_SIZE (INT_MAX/4)
7209
Victor Stinner3a50e702011-10-18 21:21:00 +02007210#ifndef WC_ERR_INVALID_CHARS
7211# define WC_ERR_INVALID_CHARS 0x0080
7212#endif
7213
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007214static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007215code_page_name(UINT code_page, PyObject **obj)
7216{
7217 *obj = NULL;
7218 if (code_page == CP_ACP)
7219 return "mbcs";
7220 if (code_page == CP_UTF7)
7221 return "CP_UTF7";
7222 if (code_page == CP_UTF8)
7223 return "CP_UTF8";
7224
7225 *obj = PyBytes_FromFormat("cp%u", code_page);
7226 if (*obj == NULL)
7227 return NULL;
7228 return PyBytes_AS_STRING(*obj);
7229}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007230
Victor Stinner3a50e702011-10-18 21:21:00 +02007231static DWORD
7232decode_code_page_flags(UINT code_page)
7233{
7234 if (code_page == CP_UTF7) {
7235 /* The CP_UTF7 decoder only supports flags=0 */
7236 return 0;
7237 }
7238 else
7239 return MB_ERR_INVALID_CHARS;
7240}
7241
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007242/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 * Decode a byte string from a Windows code page into unicode object in strict
7244 * mode.
7245 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007246 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7247 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007248 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007249static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007250decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007251 wchar_t **buf,
7252 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007253 const char *in,
7254 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007255{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007256 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007257 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007259
7260 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007262 while ((outsize = MultiByteToWideChar(code_page, flags,
7263 in, insize, NULL, 0)) <= 0)
7264 {
7265 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7266 goto error;
7267 }
7268 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7269 flags = 0;
7270 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007271
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007272 /* Extend a wchar_t* buffer */
7273 Py_ssize_t n = *bufsize; /* Get the current length */
7274 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7275 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007276 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007277 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007278
7279 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007280 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7281 if (outsize <= 0)
7282 goto error;
7283 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007284
Victor Stinner3a50e702011-10-18 21:21:00 +02007285error:
7286 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7287 return -2;
7288 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007289 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007290}
7291
Victor Stinner3a50e702011-10-18 21:21:00 +02007292/*
7293 * Decode a byte string from a code page into unicode object with an error
7294 * handler.
7295 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007296 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007297 * UnicodeDecodeError exception and returns -1 on error.
7298 */
7299static int
7300decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007301 wchar_t **buf,
7302 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007303 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007304 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007305{
7306 const char *startin = in;
7307 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007308 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007309 /* Ideally, we should get reason from FormatMessage. This is the Windows
7310 2000 English version of the message. */
7311 const char *reason = "No mapping for the Unicode character exists "
7312 "in the target code page.";
7313 /* each step cannot decode more than 1 character, but a character can be
7314 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007315 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007316 int insize;
7317 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 PyObject *errorHandler = NULL;
7319 PyObject *exc = NULL;
7320 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007321 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007322 DWORD err;
7323 int ret = -1;
7324
7325 assert(size > 0);
7326
7327 encoding = code_page_name(code_page, &encoding_obj);
7328 if (encoding == NULL)
7329 return -1;
7330
Victor Stinner7d00cc12014-03-17 23:08:06 +01007331 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007332 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7333 UnicodeDecodeError. */
7334 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7335 if (exc != NULL) {
7336 PyCodec_StrictErrors(exc);
7337 Py_CLEAR(exc);
7338 }
7339 goto error;
7340 }
7341
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007342 /* Extend a wchar_t* buffer */
7343 Py_ssize_t n = *bufsize; /* Get the current length */
7344 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7345 PyErr_NoMemory();
7346 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007347 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007348 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7349 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007350 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007351 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007352
7353 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 while (in < endin)
7355 {
7356 /* Decode a character */
7357 insize = 1;
7358 do
7359 {
7360 outsize = MultiByteToWideChar(code_page, flags,
7361 in, insize,
7362 buffer, Py_ARRAY_LENGTH(buffer));
7363 if (outsize > 0)
7364 break;
7365 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007366 if (err == ERROR_INVALID_FLAGS && flags) {
7367 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7368 flags = 0;
7369 continue;
7370 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007371 if (err != ERROR_NO_UNICODE_TRANSLATION
7372 && err != ERROR_INSUFFICIENT_BUFFER)
7373 {
7374 PyErr_SetFromWindowsErr(0);
7375 goto error;
7376 }
7377 insize++;
7378 }
7379 /* 4=maximum length of a UTF-8 sequence */
7380 while (insize <= 4 && (in + insize) <= endin);
7381
7382 if (outsize <= 0) {
7383 Py_ssize_t startinpos, endinpos, outpos;
7384
Victor Stinner7d00cc12014-03-17 23:08:06 +01007385 /* last character in partial decode? */
7386 if (in + insize >= endin && !final)
7387 break;
7388
Victor Stinner3a50e702011-10-18 21:21:00 +02007389 startinpos = in - startin;
7390 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007391 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007392 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007393 errors, &errorHandler,
7394 encoding, reason,
7395 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007396 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007397 {
7398 goto error;
7399 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007400 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007401 }
7402 else {
7403 in += insize;
7404 memcpy(out, buffer, outsize * sizeof(wchar_t));
7405 out += outsize;
7406 }
7407 }
7408
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007409 /* Shrink the buffer */
7410 assert(out - *buf <= *bufsize);
7411 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007412 /* (in - startin) <= size and size is an int */
7413 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007414
7415error:
7416 Py_XDECREF(encoding_obj);
7417 Py_XDECREF(errorHandler);
7418 Py_XDECREF(exc);
7419 return ret;
7420}
7421
Victor Stinner3a50e702011-10-18 21:21:00 +02007422static PyObject *
7423decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007424 const char *s, Py_ssize_t size,
7425 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007426{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007427 wchar_t *buf = NULL;
7428 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007429 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007430
Victor Stinner3a50e702011-10-18 21:21:00 +02007431 if (code_page < 0) {
7432 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7433 return NULL;
7434 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007435 if (size < 0) {
7436 PyErr_BadInternalCall();
7437 return NULL;
7438 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007439
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007440 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007441 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442
Victor Stinner76a31a62011-11-04 00:05:13 +01007443 do
7444 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007445#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007446 if (size > DECODING_CHUNK_SIZE) {
7447 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007448 final = 0;
7449 done = 0;
7450 }
7451 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007453 {
7454 chunk_size = (int)size;
7455 final = (consumed == NULL);
7456 done = 1;
7457 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458
Victor Stinner76a31a62011-11-04 00:05:13 +01007459 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007460 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007461 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007462 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007463 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007465 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007466 s, chunk_size);
7467 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007468 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007469 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007470 errors, final);
7471 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007472
7473 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007474 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007475 return NULL;
7476 }
7477
7478 if (consumed)
7479 *consumed += converted;
7480
7481 s += converted;
7482 size -= converted;
7483 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007484
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007485 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7486 PyMem_Free(buf);
7487 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007488}
7489
Alexander Belopolsky40018472011-02-26 01:02:56 +00007490PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007491PyUnicode_DecodeCodePageStateful(int code_page,
7492 const char *s,
7493 Py_ssize_t size,
7494 const char *errors,
7495 Py_ssize_t *consumed)
7496{
7497 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7498}
7499
7500PyObject *
7501PyUnicode_DecodeMBCSStateful(const char *s,
7502 Py_ssize_t size,
7503 const char *errors,
7504 Py_ssize_t *consumed)
7505{
7506 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7507}
7508
7509PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007510PyUnicode_DecodeMBCS(const char *s,
7511 Py_ssize_t size,
7512 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007513{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007514 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7515}
7516
Victor Stinner3a50e702011-10-18 21:21:00 +02007517static DWORD
7518encode_code_page_flags(UINT code_page, const char *errors)
7519{
7520 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007521 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 }
7523 else if (code_page == CP_UTF7) {
7524 /* CP_UTF7 only supports flags=0 */
7525 return 0;
7526 }
7527 else {
7528 if (errors != NULL && strcmp(errors, "replace") == 0)
7529 return 0;
7530 else
7531 return WC_NO_BEST_FIT_CHARS;
7532 }
7533}
7534
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007535/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 * Encode a Unicode string to a Windows code page into a byte string in strict
7537 * mode.
7538 *
7539 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007540 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007541 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007542static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007543encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007544 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007546{
Victor Stinner554f3f02010-06-16 23:33:54 +00007547 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 BOOL *pusedDefaultChar = &usedDefaultChar;
7549 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007550 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007551 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007552 const DWORD flags = encode_code_page_flags(code_page, NULL);
7553 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007554 /* Create a substring so that we can get the UTF-16 representation
7555 of just the slice under consideration. */
7556 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007557
Martin v. Löwis3d325192011-11-04 18:23:06 +01007558 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007559
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007561 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007563 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007564
Victor Stinner2fc507f2011-11-04 20:06:39 +01007565 substring = PyUnicode_Substring(unicode, offset, offset+len);
7566 if (substring == NULL)
7567 return -1;
7568 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7569 if (p == NULL) {
7570 Py_DECREF(substring);
7571 return -1;
7572 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007573 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007574
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007575 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007576 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007577 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 NULL, 0,
7579 NULL, pusedDefaultChar);
7580 if (outsize <= 0)
7581 goto error;
7582 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007583 if (pusedDefaultChar && *pusedDefaultChar) {
7584 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007586 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007587
Victor Stinner3a50e702011-10-18 21:21:00 +02007588 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007589 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007591 if (*outbytes == NULL) {
7592 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007593 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007594 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007596 }
7597 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007598 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007599 const Py_ssize_t n = PyBytes_Size(*outbytes);
7600 if (outsize > PY_SSIZE_T_MAX - n) {
7601 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007602 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007603 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007604 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007605 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7606 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007607 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007608 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007610 }
7611
7612 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007613 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007614 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007615 out, outsize,
7616 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007617 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 if (outsize <= 0)
7619 goto error;
7620 if (pusedDefaultChar && *pusedDefaultChar)
7621 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007622 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007623
Victor Stinner3a50e702011-10-18 21:21:00 +02007624error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007626 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7627 return -2;
7628 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007629 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007630}
7631
Victor Stinner3a50e702011-10-18 21:21:00 +02007632/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007633 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 * error handler.
7635 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007636 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 * -1 on other error.
7638 */
7639static int
7640encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007641 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007642 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007643{
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007645 Py_ssize_t pos = unicode_offset;
7646 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007647 /* Ideally, we should get reason from FormatMessage. This is the Windows
7648 2000 English version of the message. */
7649 const char *reason = "invalid character";
7650 /* 4=maximum length of a UTF-8 sequence */
7651 char buffer[4];
7652 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7653 Py_ssize_t outsize;
7654 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007655 PyObject *errorHandler = NULL;
7656 PyObject *exc = NULL;
7657 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007658 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007659 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 PyObject *rep;
7661 int ret = -1;
7662
7663 assert(insize > 0);
7664
7665 encoding = code_page_name(code_page, &encoding_obj);
7666 if (encoding == NULL)
7667 return -1;
7668
7669 if (errors == NULL || strcmp(errors, "strict") == 0) {
7670 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7671 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007672 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007673 if (exc != NULL) {
7674 PyCodec_StrictErrors(exc);
7675 Py_DECREF(exc);
7676 }
7677 Py_XDECREF(encoding_obj);
7678 return -1;
7679 }
7680
7681 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7682 pusedDefaultChar = &usedDefaultChar;
7683 else
7684 pusedDefaultChar = NULL;
7685
7686 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7687 PyErr_NoMemory();
7688 goto error;
7689 }
7690 outsize = insize * Py_ARRAY_LENGTH(buffer);
7691
7692 if (*outbytes == NULL) {
7693 /* Create string object */
7694 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7695 if (*outbytes == NULL)
7696 goto error;
7697 out = PyBytes_AS_STRING(*outbytes);
7698 }
7699 else {
7700 /* Extend string object */
7701 Py_ssize_t n = PyBytes_Size(*outbytes);
7702 if (n > PY_SSIZE_T_MAX - outsize) {
7703 PyErr_NoMemory();
7704 goto error;
7705 }
7706 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7707 goto error;
7708 out = PyBytes_AS_STRING(*outbytes) + n;
7709 }
7710
7711 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007712 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007713 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007714 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7715 wchar_t chars[2];
7716 int charsize;
7717 if (ch < 0x10000) {
7718 chars[0] = (wchar_t)ch;
7719 charsize = 1;
7720 }
7721 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007722 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7723 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007724 charsize = 2;
7725 }
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007728 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007729 buffer, Py_ARRAY_LENGTH(buffer),
7730 NULL, pusedDefaultChar);
7731 if (outsize > 0) {
7732 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7733 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007734 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007735 memcpy(out, buffer, outsize);
7736 out += outsize;
7737 continue;
7738 }
7739 }
7740 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7741 PyErr_SetFromWindowsErr(0);
7742 goto error;
7743 }
7744
Victor Stinner3a50e702011-10-18 21:21:00 +02007745 rep = unicode_encode_call_errorhandler(
7746 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007747 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007749 if (rep == NULL)
7750 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007752
7753 if (PyBytes_Check(rep)) {
7754 outsize = PyBytes_GET_SIZE(rep);
7755 if (outsize != 1) {
7756 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7757 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7758 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7759 Py_DECREF(rep);
7760 goto error;
7761 }
7762 out = PyBytes_AS_STRING(*outbytes) + offset;
7763 }
7764 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7765 out += outsize;
7766 }
7767 else {
7768 Py_ssize_t i;
7769 enum PyUnicode_Kind kind;
7770 void *data;
7771
Benjamin Petersonbac79492012-01-14 13:34:47 -05007772 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007773 Py_DECREF(rep);
7774 goto error;
7775 }
7776
7777 outsize = PyUnicode_GET_LENGTH(rep);
7778 if (outsize != 1) {
7779 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7780 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7781 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7782 Py_DECREF(rep);
7783 goto error;
7784 }
7785 out = PyBytes_AS_STRING(*outbytes) + offset;
7786 }
7787 kind = PyUnicode_KIND(rep);
7788 data = PyUnicode_DATA(rep);
7789 for (i=0; i < outsize; i++) {
7790 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7791 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007792 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007793 encoding, unicode,
7794 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007795 "unable to encode error handler result to ASCII");
7796 Py_DECREF(rep);
7797 goto error;
7798 }
7799 *out = (unsigned char)ch;
7800 out++;
7801 }
7802 }
7803 Py_DECREF(rep);
7804 }
7805 /* write a NUL byte */
7806 *out = 0;
7807 outsize = out - PyBytes_AS_STRING(*outbytes);
7808 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7809 if (_PyBytes_Resize(outbytes, outsize) < 0)
7810 goto error;
7811 ret = 0;
7812
7813error:
7814 Py_XDECREF(encoding_obj);
7815 Py_XDECREF(errorHandler);
7816 Py_XDECREF(exc);
7817 return ret;
7818}
7819
Victor Stinner3a50e702011-10-18 21:21:00 +02007820static PyObject *
7821encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007822 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007823 const char *errors)
7824{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007825 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007826 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007827 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007828 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007829
Victor Stinner29dacf22015-01-26 16:41:32 +01007830 if (!PyUnicode_Check(unicode)) {
7831 PyErr_BadArgument();
7832 return NULL;
7833 }
7834
Benjamin Petersonbac79492012-01-14 13:34:47 -05007835 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007836 return NULL;
7837 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007838
Victor Stinner3a50e702011-10-18 21:21:00 +02007839 if (code_page < 0) {
7840 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7841 return NULL;
7842 }
7843
Martin v. Löwis3d325192011-11-04 18:23:06 +01007844 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007845 return PyBytes_FromStringAndSize(NULL, 0);
7846
Victor Stinner7581cef2011-11-03 22:32:33 +01007847 offset = 0;
7848 do
7849 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007850#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007851 if (len > DECODING_CHUNK_SIZE) {
7852 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007853 done = 0;
7854 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007855 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007856#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007857 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007858 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007859 done = 1;
7860 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007861
Victor Stinner76a31a62011-11-04 00:05:13 +01007862 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007863 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007864 errors);
7865 if (ret == -2)
7866 ret = encode_code_page_errors(code_page, &outbytes,
7867 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007868 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007869 if (ret < 0) {
7870 Py_XDECREF(outbytes);
7871 return NULL;
7872 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007873
Victor Stinner7581cef2011-11-03 22:32:33 +01007874 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007875 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007876 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007877
Victor Stinner3a50e702011-10-18 21:21:00 +02007878 return outbytes;
7879}
7880
7881PyObject *
7882PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7883 Py_ssize_t size,
7884 const char *errors)
7885{
Victor Stinner7581cef2011-11-03 22:32:33 +01007886 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007887 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007888 if (unicode == NULL)
7889 return NULL;
7890 res = encode_code_page(CP_ACP, unicode, errors);
7891 Py_DECREF(unicode);
7892 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007893}
7894
7895PyObject *
7896PyUnicode_EncodeCodePage(int code_page,
7897 PyObject *unicode,
7898 const char *errors)
7899{
Victor Stinner7581cef2011-11-03 22:32:33 +01007900 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007901}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007902
Alexander Belopolsky40018472011-02-26 01:02:56 +00007903PyObject *
7904PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007905{
Victor Stinner7581cef2011-11-03 22:32:33 +01007906 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007907}
7908
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007909#undef NEED_RETRY
7910
Steve Dowercc16be82016-09-08 10:35:16 -07007911#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007912
Guido van Rossumd57fd912000-03-10 22:53:23 +00007913/* --- Character Mapping Codec -------------------------------------------- */
7914
Victor Stinnerfb161b12013-04-18 01:44:27 +02007915static int
7916charmap_decode_string(const char *s,
7917 Py_ssize_t size,
7918 PyObject *mapping,
7919 const char *errors,
7920 _PyUnicodeWriter *writer)
7921{
7922 const char *starts = s;
7923 const char *e;
7924 Py_ssize_t startinpos, endinpos;
7925 PyObject *errorHandler = NULL, *exc = NULL;
7926 Py_ssize_t maplen;
7927 enum PyUnicode_Kind mapkind;
7928 void *mapdata;
7929 Py_UCS4 x;
7930 unsigned char ch;
7931
7932 if (PyUnicode_READY(mapping) == -1)
7933 return -1;
7934
7935 maplen = PyUnicode_GET_LENGTH(mapping);
7936 mapdata = PyUnicode_DATA(mapping);
7937 mapkind = PyUnicode_KIND(mapping);
7938
7939 e = s + size;
7940
7941 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7942 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7943 * is disabled in encoding aliases, latin1 is preferred because
7944 * its implementation is faster. */
7945 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7946 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7947 Py_UCS4 maxchar = writer->maxchar;
7948
7949 assert (writer->kind == PyUnicode_1BYTE_KIND);
7950 while (s < e) {
7951 ch = *s;
7952 x = mapdata_ucs1[ch];
7953 if (x > maxchar) {
7954 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7955 goto onError;
7956 maxchar = writer->maxchar;
7957 outdata = (Py_UCS1 *)writer->data;
7958 }
7959 outdata[writer->pos] = x;
7960 writer->pos++;
7961 ++s;
7962 }
7963 return 0;
7964 }
7965
7966 while (s < e) {
7967 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7968 enum PyUnicode_Kind outkind = writer->kind;
7969 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7970 if (outkind == PyUnicode_1BYTE_KIND) {
7971 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7972 Py_UCS4 maxchar = writer->maxchar;
7973 while (s < e) {
7974 ch = *s;
7975 x = mapdata_ucs2[ch];
7976 if (x > maxchar)
7977 goto Error;
7978 outdata[writer->pos] = x;
7979 writer->pos++;
7980 ++s;
7981 }
7982 break;
7983 }
7984 else if (outkind == PyUnicode_2BYTE_KIND) {
7985 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7986 while (s < e) {
7987 ch = *s;
7988 x = mapdata_ucs2[ch];
7989 if (x == 0xFFFE)
7990 goto Error;
7991 outdata[writer->pos] = x;
7992 writer->pos++;
7993 ++s;
7994 }
7995 break;
7996 }
7997 }
7998 ch = *s;
7999
8000 if (ch < maplen)
8001 x = PyUnicode_READ(mapkind, mapdata, ch);
8002 else
8003 x = 0xfffe; /* invalid value */
8004Error:
8005 if (x == 0xfffe)
8006 {
8007 /* undefined mapping */
8008 startinpos = s-starts;
8009 endinpos = startinpos+1;
8010 if (unicode_decode_call_errorhandler_writer(
8011 errors, &errorHandler,
8012 "charmap", "character maps to <undefined>",
8013 &starts, &e, &startinpos, &endinpos, &exc, &s,
8014 writer)) {
8015 goto onError;
8016 }
8017 continue;
8018 }
8019
8020 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8021 goto onError;
8022 ++s;
8023 }
8024 Py_XDECREF(errorHandler);
8025 Py_XDECREF(exc);
8026 return 0;
8027
8028onError:
8029 Py_XDECREF(errorHandler);
8030 Py_XDECREF(exc);
8031 return -1;
8032}
8033
8034static int
8035charmap_decode_mapping(const char *s,
8036 Py_ssize_t size,
8037 PyObject *mapping,
8038 const char *errors,
8039 _PyUnicodeWriter *writer)
8040{
8041 const char *starts = s;
8042 const char *e;
8043 Py_ssize_t startinpos, endinpos;
8044 PyObject *errorHandler = NULL, *exc = NULL;
8045 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008046 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008047
8048 e = s + size;
8049
8050 while (s < e) {
8051 ch = *s;
8052
8053 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8054 key = PyLong_FromLong((long)ch);
8055 if (key == NULL)
8056 goto onError;
8057
8058 item = PyObject_GetItem(mapping, key);
8059 Py_DECREF(key);
8060 if (item == NULL) {
8061 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8062 /* No mapping found means: mapping is undefined. */
8063 PyErr_Clear();
8064 goto Undefined;
8065 } else
8066 goto onError;
8067 }
8068
8069 /* Apply mapping */
8070 if (item == Py_None)
8071 goto Undefined;
8072 if (PyLong_Check(item)) {
8073 long value = PyLong_AS_LONG(item);
8074 if (value == 0xFFFE)
8075 goto Undefined;
8076 if (value < 0 || value > MAX_UNICODE) {
8077 PyErr_Format(PyExc_TypeError,
8078 "character mapping must be in range(0x%lx)",
8079 (unsigned long)MAX_UNICODE + 1);
8080 goto onError;
8081 }
8082
8083 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8084 goto onError;
8085 }
8086 else if (PyUnicode_Check(item)) {
8087 if (PyUnicode_READY(item) == -1)
8088 goto onError;
8089 if (PyUnicode_GET_LENGTH(item) == 1) {
8090 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8091 if (value == 0xFFFE)
8092 goto Undefined;
8093 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8094 goto onError;
8095 }
8096 else {
8097 writer->overallocate = 1;
8098 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8099 goto onError;
8100 }
8101 }
8102 else {
8103 /* wrong return value */
8104 PyErr_SetString(PyExc_TypeError,
8105 "character mapping must return integer, None or str");
8106 goto onError;
8107 }
8108 Py_CLEAR(item);
8109 ++s;
8110 continue;
8111
8112Undefined:
8113 /* undefined mapping */
8114 Py_CLEAR(item);
8115 startinpos = s-starts;
8116 endinpos = startinpos+1;
8117 if (unicode_decode_call_errorhandler_writer(
8118 errors, &errorHandler,
8119 "charmap", "character maps to <undefined>",
8120 &starts, &e, &startinpos, &endinpos, &exc, &s,
8121 writer)) {
8122 goto onError;
8123 }
8124 }
8125 Py_XDECREF(errorHandler);
8126 Py_XDECREF(exc);
8127 return 0;
8128
8129onError:
8130 Py_XDECREF(item);
8131 Py_XDECREF(errorHandler);
8132 Py_XDECREF(exc);
8133 return -1;
8134}
8135
Alexander Belopolsky40018472011-02-26 01:02:56 +00008136PyObject *
8137PyUnicode_DecodeCharmap(const char *s,
8138 Py_ssize_t size,
8139 PyObject *mapping,
8140 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008141{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008142 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008143
Guido van Rossumd57fd912000-03-10 22:53:23 +00008144 /* Default to Latin-1 */
8145 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008146 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008147
Guido van Rossumd57fd912000-03-10 22:53:23 +00008148 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008149 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008150 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008151 writer.min_length = size;
8152 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008153 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008154
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008155 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008156 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8157 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008158 }
8159 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008160 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8161 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008162 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008163 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008164
Benjamin Peterson29060642009-01-31 22:14:21 +00008165 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008166 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008167 return NULL;
8168}
8169
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170/* Charmap encoding: the lookup table */
8171
Alexander Belopolsky40018472011-02-26 01:02:56 +00008172struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008173 PyObject_HEAD
8174 unsigned char level1[32];
8175 int count2, count3;
8176 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177};
8178
8179static PyObject*
8180encoding_map_size(PyObject *obj, PyObject* args)
8181{
8182 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008183 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008184 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185}
8186
8187static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008188 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008189 PyDoc_STR("Return the size (in bytes) of this object") },
8190 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008191};
8192
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008194 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008195 "EncodingMap", /*tp_name*/
8196 sizeof(struct encoding_map), /*tp_basicsize*/
8197 0, /*tp_itemsize*/
8198 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008199 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008200 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008201 0, /*tp_getattr*/
8202 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008203 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008204 0, /*tp_repr*/
8205 0, /*tp_as_number*/
8206 0, /*tp_as_sequence*/
8207 0, /*tp_as_mapping*/
8208 0, /*tp_hash*/
8209 0, /*tp_call*/
8210 0, /*tp_str*/
8211 0, /*tp_getattro*/
8212 0, /*tp_setattro*/
8213 0, /*tp_as_buffer*/
8214 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8215 0, /*tp_doc*/
8216 0, /*tp_traverse*/
8217 0, /*tp_clear*/
8218 0, /*tp_richcompare*/
8219 0, /*tp_weaklistoffset*/
8220 0, /*tp_iter*/
8221 0, /*tp_iternext*/
8222 encoding_map_methods, /*tp_methods*/
8223 0, /*tp_members*/
8224 0, /*tp_getset*/
8225 0, /*tp_base*/
8226 0, /*tp_dict*/
8227 0, /*tp_descr_get*/
8228 0, /*tp_descr_set*/
8229 0, /*tp_dictoffset*/
8230 0, /*tp_init*/
8231 0, /*tp_alloc*/
8232 0, /*tp_new*/
8233 0, /*tp_free*/
8234 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008235};
8236
8237PyObject*
8238PyUnicode_BuildEncodingMap(PyObject* string)
8239{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008240 PyObject *result;
8241 struct encoding_map *mresult;
8242 int i;
8243 int need_dict = 0;
8244 unsigned char level1[32];
8245 unsigned char level2[512];
8246 unsigned char *mlevel1, *mlevel2, *mlevel3;
8247 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008248 int kind;
8249 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008250 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008251 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008252
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008253 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008254 PyErr_BadArgument();
8255 return NULL;
8256 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008257 kind = PyUnicode_KIND(string);
8258 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008259 length = PyUnicode_GET_LENGTH(string);
8260 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008261 memset(level1, 0xFF, sizeof level1);
8262 memset(level2, 0xFF, sizeof level2);
8263
8264 /* If there isn't a one-to-one mapping of NULL to \0,
8265 or if there are non-BMP characters, we need to use
8266 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008267 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008269 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008270 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008271 ch = PyUnicode_READ(kind, data, i);
8272 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 need_dict = 1;
8274 break;
8275 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008276 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008277 /* unmapped character */
8278 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008279 l1 = ch >> 11;
8280 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 if (level1[l1] == 0xFF)
8282 level1[l1] = count2++;
8283 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008284 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285 }
8286
8287 if (count2 >= 0xFF || count3 >= 0xFF)
8288 need_dict = 1;
8289
8290 if (need_dict) {
8291 PyObject *result = PyDict_New();
8292 PyObject *key, *value;
8293 if (!result)
8294 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008295 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008296 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008297 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008298 if (!key || !value)
8299 goto failed1;
8300 if (PyDict_SetItem(result, key, value) == -1)
8301 goto failed1;
8302 Py_DECREF(key);
8303 Py_DECREF(value);
8304 }
8305 return result;
8306 failed1:
8307 Py_XDECREF(key);
8308 Py_XDECREF(value);
8309 Py_DECREF(result);
8310 return NULL;
8311 }
8312
8313 /* Create a three-level trie */
8314 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8315 16*count2 + 128*count3 - 1);
8316 if (!result)
8317 return PyErr_NoMemory();
8318 PyObject_Init(result, &EncodingMapType);
8319 mresult = (struct encoding_map*)result;
8320 mresult->count2 = count2;
8321 mresult->count3 = count3;
8322 mlevel1 = mresult->level1;
8323 mlevel2 = mresult->level23;
8324 mlevel3 = mresult->level23 + 16*count2;
8325 memcpy(mlevel1, level1, 32);
8326 memset(mlevel2, 0xFF, 16*count2);
8327 memset(mlevel3, 0, 128*count3);
8328 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008329 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008330 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008331 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8332 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008333 /* unmapped character */
8334 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008335 o1 = ch>>11;
8336 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008337 i2 = 16*mlevel1[o1] + o2;
8338 if (mlevel2[i2] == 0xFF)
8339 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008340 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008341 i3 = 128*mlevel2[i2] + o3;
8342 mlevel3[i3] = i;
8343 }
8344 return result;
8345}
8346
8347static int
Victor Stinner22168992011-11-20 17:09:18 +01008348encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008349{
8350 struct encoding_map *map = (struct encoding_map*)mapping;
8351 int l1 = c>>11;
8352 int l2 = (c>>7) & 0xF;
8353 int l3 = c & 0x7F;
8354 int i;
8355
Victor Stinner22168992011-11-20 17:09:18 +01008356 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008357 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008358 if (c == 0)
8359 return 0;
8360 /* level 1*/
8361 i = map->level1[l1];
8362 if (i == 0xFF) {
8363 return -1;
8364 }
8365 /* level 2*/
8366 i = map->level23[16*i+l2];
8367 if (i == 0xFF) {
8368 return -1;
8369 }
8370 /* level 3 */
8371 i = map->level23[16*map->count2 + 128*i + l3];
8372 if (i == 0) {
8373 return -1;
8374 }
8375 return i;
8376}
8377
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008378/* Lookup the character ch in the mapping. If the character
8379 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008380 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008381static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008382charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008383{
Christian Heimes217cfd12007-12-02 14:31:20 +00008384 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008385 PyObject *x;
8386
8387 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008388 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 x = PyObject_GetItem(mapping, w);
8390 Py_DECREF(w);
8391 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8393 /* No mapping found means: mapping is undefined. */
8394 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008395 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008396 } else
8397 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008398 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008399 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008400 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008401 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008402 long value = PyLong_AS_LONG(x);
8403 if (value < 0 || value > 255) {
8404 PyErr_SetString(PyExc_TypeError,
8405 "character mapping must be in range(256)");
8406 Py_DECREF(x);
8407 return NULL;
8408 }
8409 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008410 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008411 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008412 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008413 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008414 /* wrong return value */
8415 PyErr_Format(PyExc_TypeError,
8416 "character mapping must return integer, bytes or None, not %.400s",
8417 x->ob_type->tp_name);
8418 Py_DECREF(x);
8419 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008420 }
8421}
8422
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008423static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008424charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008425{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008426 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8427 /* exponentially overallocate to minimize reallocations */
8428 if (requiredsize < 2*outsize)
8429 requiredsize = 2*outsize;
8430 if (_PyBytes_Resize(outobj, requiredsize))
8431 return -1;
8432 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008433}
8434
/* Outcome of pushing one character through a charmap encoder. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008439 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008440 space is available. Return a new reference to the object that
8441 was put in the output buffer, or Py_None, if the mapping was undefined
8442 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008443 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008444static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008445charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008446 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008448 PyObject *rep;
8449 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008450 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451
Christian Heimes90aa7642007-12-19 02:45:37 +00008452 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008454 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008455 if (res == -1)
8456 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if (outsize<requiredsize)
8458 if (charmapencode_resize(outobj, outpos, requiredsize))
8459 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008460 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008461 outstart[(*outpos)++] = (char)res;
8462 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008463 }
8464
8465 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008466 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008467 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008468 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008469 Py_DECREF(rep);
8470 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008471 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 if (PyLong_Check(rep)) {
8473 Py_ssize_t requiredsize = *outpos+1;
8474 if (outsize<requiredsize)
8475 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8476 Py_DECREF(rep);
8477 return enc_EXCEPTION;
8478 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008479 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008481 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 else {
8483 const char *repchars = PyBytes_AS_STRING(rep);
8484 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8485 Py_ssize_t requiredsize = *outpos+repsize;
8486 if (outsize<requiredsize)
8487 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8488 Py_DECREF(rep);
8489 return enc_EXCEPTION;
8490 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008491 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 memcpy(outstart + *outpos, repchars, repsize);
8493 *outpos += repsize;
8494 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008495 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008496 Py_DECREF(rep);
8497 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008498}
8499
8500/* handle an error in PyUnicode_EncodeCharmap
8501 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008502static int
8503charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008504 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008505 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008506 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008507 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008508{
8509 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008510 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008511 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008512 enum PyUnicode_Kind kind;
8513 void *data;
8514 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008515 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008516 Py_ssize_t collstartpos = *inpos;
8517 Py_ssize_t collendpos = *inpos+1;
8518 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008519 const char *encoding = "charmap";
8520 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008521 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008522 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008523 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008524
Benjamin Petersonbac79492012-01-14 13:34:47 -05008525 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008526 return -1;
8527 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 /* find all unencodable characters */
8529 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008530 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008531 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008532 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008533 val = encoding_map_lookup(ch, mapping);
8534 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008535 break;
8536 ++collendpos;
8537 continue;
8538 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008539
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008540 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8541 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008542 if (rep==NULL)
8543 return -1;
8544 else if (rep!=Py_None) {
8545 Py_DECREF(rep);
8546 break;
8547 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008548 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008550 }
8551 /* cache callback name lookup
8552 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008553 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008554 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008555
8556 switch (*error_handler) {
8557 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008558 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008559 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008560
8561 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008562 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008563 x = charmapencode_output('?', mapping, res, respos);
8564 if (x==enc_EXCEPTION) {
8565 return -1;
8566 }
8567 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008568 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008569 return -1;
8570 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008571 }
8572 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008573 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008574 *inpos = collendpos;
8575 break;
Victor Stinner50149202015-09-22 00:26:54 +02008576
8577 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008578 /* generate replacement (temporarily (mis)uses p) */
8579 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008580 char buffer[2+29+1+1];
8581 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008582 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008583 for (cp = buffer; *cp; ++cp) {
8584 x = charmapencode_output(*cp, mapping, res, respos);
8585 if (x==enc_EXCEPTION)
8586 return -1;
8587 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008588 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008589 return -1;
8590 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008591 }
8592 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008593 *inpos = collendpos;
8594 break;
Victor Stinner50149202015-09-22 00:26:54 +02008595
Benjamin Peterson14339b62009-01-31 16:36:08 +00008596 default:
Victor Stinner50149202015-09-22 00:26:54 +02008597 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008598 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008600 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008602 if (PyBytes_Check(repunicode)) {
8603 /* Directly copy bytes result to output. */
8604 Py_ssize_t outsize = PyBytes_Size(*res);
8605 Py_ssize_t requiredsize;
8606 repsize = PyBytes_Size(repunicode);
8607 requiredsize = *respos + repsize;
8608 if (requiredsize > outsize)
8609 /* Make room for all additional bytes. */
8610 if (charmapencode_resize(res, respos, requiredsize)) {
8611 Py_DECREF(repunicode);
8612 return -1;
8613 }
8614 memcpy(PyBytes_AsString(*res) + *respos,
8615 PyBytes_AsString(repunicode), repsize);
8616 *respos += repsize;
8617 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008618 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008619 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008620 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008621 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008622 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008623 Py_DECREF(repunicode);
8624 return -1;
8625 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008626 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008627 data = PyUnicode_DATA(repunicode);
8628 kind = PyUnicode_KIND(repunicode);
8629 for (index = 0; index < repsize; index++) {
8630 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8631 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008633 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return -1;
8635 }
8636 else if (x==enc_FAILED) {
8637 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008638 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008639 return -1;
8640 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008641 }
8642 *inpos = newpos;
8643 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008644 }
8645 return 0;
8646}
8647
Alexander Belopolsky40018472011-02-26 01:02:56 +00008648PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008649_PyUnicode_EncodeCharmap(PyObject *unicode,
8650 PyObject *mapping,
8651 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008652{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008653 /* output object */
8654 PyObject *res = NULL;
8655 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008656 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008657 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008658 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008659 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008660 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008662 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008663 void *data;
8664 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665
Benjamin Petersonbac79492012-01-14 13:34:47 -05008666 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008667 return NULL;
8668 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008669 data = PyUnicode_DATA(unicode);
8670 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008671
Guido van Rossumd57fd912000-03-10 22:53:23 +00008672 /* Default to Latin-1 */
8673 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008674 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 /* allocate enough for a simple encoding without
8677 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008678 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679 if (res == NULL)
8680 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008681 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008682 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008683
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008684 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008685 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008686 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008687 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008688 if (x==enc_EXCEPTION) /* error */
8689 goto onError;
8690 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008691 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008692 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008693 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008694 &res, &respos)) {
8695 goto onError;
8696 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008697 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 else
8699 /* done with this character => adjust input position */
8700 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008701 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008702
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008704 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008705 if (_PyBytes_Resize(&res, respos) < 0)
8706 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008707
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008709 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 return res;
8711
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 Py_XDECREF(res);
8714 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008715 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716 return NULL;
8717}
8718
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008719/* Deprecated */
8720PyObject *
8721PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8722 Py_ssize_t size,
8723 PyObject *mapping,
8724 const char *errors)
8725{
8726 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008727 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008728 if (unicode == NULL)
8729 return NULL;
8730 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8731 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008732 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008733}
8734
Alexander Belopolsky40018472011-02-26 01:02:56 +00008735PyObject *
8736PyUnicode_AsCharmapString(PyObject *unicode,
8737 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008738{
8739 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008740 PyErr_BadArgument();
8741 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008742 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008743 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008744}
8745
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008747static void
8748make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008749 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008750 Py_ssize_t startpos, Py_ssize_t endpos,
8751 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008752{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008753 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754 *exceptionObject = _PyUnicodeTranslateError_Create(
8755 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008756 }
8757 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8759 goto onError;
8760 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8761 goto onError;
8762 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8763 goto onError;
8764 return;
8765 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008766 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008767 }
8768}
8769
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008770/* error handling callback helper:
8771 build arguments, call the callback and check the arguments,
8772 put the result into newpos and return the replacement string, which
8773 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008774static PyObject *
8775unicode_translate_call_errorhandler(const char *errors,
8776 PyObject **errorHandler,
8777 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008778 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008779 Py_ssize_t startpos, Py_ssize_t endpos,
8780 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008781{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008782 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008783
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008784 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785 PyObject *restuple;
8786 PyObject *resunicode;
8787
8788 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008789 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008790 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008792 }
8793
8794 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008795 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008796 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008798
Jeroen Demeyer196a5302019-07-04 12:31:34 +02008799 restuple = _PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008800 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008802 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008803 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 Py_DECREF(restuple);
8805 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008806 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008807 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 &resunicode, &i_newpos)) {
8809 Py_DECREF(restuple);
8810 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008812 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008813 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008814 else
8815 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008817 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008818 Py_DECREF(restuple);
8819 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008820 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008821 Py_INCREF(resunicode);
8822 Py_DECREF(restuple);
8823 return resunicode;
8824}
8825
8826/* Lookup the character ch in the mapping and put the result in result,
8827 which must be decrefed by the caller.
8828 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008829static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008830charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831{
Christian Heimes217cfd12007-12-02 14:31:20 +00008832 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833 PyObject *x;
8834
8835 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008836 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008837 x = PyObject_GetItem(mapping, w);
8838 Py_DECREF(w);
8839 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8841 /* No mapping found means: use 1:1 mapping. */
8842 PyErr_Clear();
8843 *result = NULL;
8844 return 0;
8845 } else
8846 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008847 }
8848 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008849 *result = x;
8850 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008851 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008852 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008853 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008854 if (value < 0 || value > MAX_UNICODE) {
8855 PyErr_Format(PyExc_ValueError,
8856 "character mapping must be in range(0x%x)",
8857 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008858 Py_DECREF(x);
8859 return -1;
8860 }
8861 *result = x;
8862 return 0;
8863 }
8864 else if (PyUnicode_Check(x)) {
8865 *result = x;
8866 return 0;
8867 }
8868 else {
8869 /* wrong return value */
8870 PyErr_SetString(PyExc_TypeError,
8871 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008872 Py_DECREF(x);
8873 return -1;
8874 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008875}
Victor Stinner1194ea02014-04-04 19:37:40 +02008876
8877/* lookup the character, write the result into the writer.
8878 Return 1 if the result was written into the writer, return 0 if the mapping
8879 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008880static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008881charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8882 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008883{
Victor Stinner1194ea02014-04-04 19:37:40 +02008884 PyObject *item;
8885
8886 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008887 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008888
8889 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008890 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008891 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008892 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008893 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008894 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008895 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008896
8897 if (item == Py_None) {
8898 Py_DECREF(item);
8899 return 0;
8900 }
8901
8902 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008903 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8904 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8905 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008906 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8907 Py_DECREF(item);
8908 return -1;
8909 }
8910 Py_DECREF(item);
8911 return 1;
8912 }
8913
8914 if (!PyUnicode_Check(item)) {
8915 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008916 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008917 }
8918
8919 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8920 Py_DECREF(item);
8921 return -1;
8922 }
8923
8924 Py_DECREF(item);
8925 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008926}
8927
Victor Stinner89a76ab2014-04-05 11:44:04 +02008928static int
8929unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8930 Py_UCS1 *translate)
8931{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008932 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008933 int ret = 0;
8934
Victor Stinner89a76ab2014-04-05 11:44:04 +02008935 if (charmaptranslate_lookup(ch, mapping, &item)) {
8936 return -1;
8937 }
8938
8939 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008940 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008941 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008942 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008943 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 /* not found => default to 1:1 mapping */
8945 translate[ch] = ch;
8946 return 1;
8947 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008948 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008949 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008950 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8951 used it */
8952 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008953 /* invalid character or character outside ASCII:
8954 skip the fast translate */
8955 goto exit;
8956 }
8957 translate[ch] = (Py_UCS1)replace;
8958 }
8959 else if (PyUnicode_Check(item)) {
8960 Py_UCS4 replace;
8961
8962 if (PyUnicode_READY(item) == -1) {
8963 Py_DECREF(item);
8964 return -1;
8965 }
8966 if (PyUnicode_GET_LENGTH(item) != 1)
8967 goto exit;
8968
8969 replace = PyUnicode_READ_CHAR(item, 0);
8970 if (replace > 127)
8971 goto exit;
8972 translate[ch] = (Py_UCS1)replace;
8973 }
8974 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008975 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008976 goto exit;
8977 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978 ret = 1;
8979
Benjamin Peterson1365de72014-04-07 20:15:41 -04008980 exit:
8981 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008982 return ret;
8983}
8984
8985/* Fast path for ascii => ascii translation. Return 1 if the whole string
8986 was translated into writer, return 0 if the input string was partially
8987 translated into writer, raise an exception and return -1 on error. */
8988static int
8989unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008990 _PyUnicodeWriter *writer, int ignore,
8991 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008992{
Victor Stinner872b2912014-04-05 14:27:07 +02008993 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008994 Py_ssize_t len;
8995 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008996 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997
Victor Stinner89a76ab2014-04-05 11:44:04 +02008998 len = PyUnicode_GET_LENGTH(input);
8999
Victor Stinner872b2912014-04-05 14:27:07 +02009000 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009001
9002 in = PyUnicode_1BYTE_DATA(input);
9003 end = in + len;
9004
9005 assert(PyUnicode_IS_ASCII(writer->buffer));
9006 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9007 out = PyUnicode_1BYTE_DATA(writer->buffer);
9008
Victor Stinner872b2912014-04-05 14:27:07 +02009009 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009010 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009011 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009013 int translate = unicode_fast_translate_lookup(mapping, ch,
9014 ascii_table);
9015 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009017 if (translate == 0)
9018 goto exit;
9019 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009020 }
Victor Stinner872b2912014-04-05 14:27:07 +02009021 if (ch2 == 0xfe) {
9022 if (ignore)
9023 continue;
9024 goto exit;
9025 }
9026 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009028 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009029 }
Victor Stinner872b2912014-04-05 14:27:07 +02009030 res = 1;
9031
9032exit:
9033 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009034 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009035 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009036}
9037
Victor Stinner3222da22015-10-01 22:07:32 +02009038static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009039_PyUnicode_TranslateCharmap(PyObject *input,
9040 PyObject *mapping,
9041 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009042{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009043 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009044 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009045 Py_ssize_t size, i;
9046 int kind;
9047 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009048 _PyUnicodeWriter writer;
9049 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009050 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009051 PyObject *errorHandler = NULL;
9052 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009053 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009054 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009055
Guido van Rossumd57fd912000-03-10 22:53:23 +00009056 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009057 PyErr_BadArgument();
9058 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009059 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009060
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009061 if (PyUnicode_READY(input) == -1)
9062 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009063 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009064 kind = PyUnicode_KIND(input);
9065 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009066
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009067 if (size == 0)
9068 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009069
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009070 /* allocate enough for a simple 1:1 translation without
9071 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009072 _PyUnicodeWriter_Init(&writer);
9073 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075
Victor Stinner872b2912014-04-05 14:27:07 +02009076 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9077
Victor Stinner33798672016-03-01 21:59:58 +01009078 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009079 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009080 if (PyUnicode_IS_ASCII(input)) {
9081 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9082 if (res < 0) {
9083 _PyUnicodeWriter_Dealloc(&writer);
9084 return NULL;
9085 }
9086 if (res == 1)
9087 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009088 }
Victor Stinner33798672016-03-01 21:59:58 +01009089 else {
9090 i = 0;
9091 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009093 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009094 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009095 int translate;
9096 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9097 Py_ssize_t newpos;
9098 /* startpos for collecting untranslatable chars */
9099 Py_ssize_t collstart;
9100 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009101 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102
Victor Stinner1194ea02014-04-04 19:37:40 +02009103 ch = PyUnicode_READ(kind, data, i);
9104 translate = charmaptranslate_output(ch, mapping, &writer);
9105 if (translate < 0)
9106 goto onError;
9107
9108 if (translate != 0) {
9109 /* it worked => adjust input pointer */
9110 ++i;
9111 continue;
9112 }
9113
9114 /* untranslatable character */
9115 collstart = i;
9116 collend = i+1;
9117
9118 /* find all untranslatable characters */
9119 while (collend < size) {
9120 PyObject *x;
9121 ch = PyUnicode_READ(kind, data, collend);
9122 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009123 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009124 Py_XDECREF(x);
9125 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009126 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009127 ++collend;
9128 }
9129
9130 if (ignore) {
9131 i = collend;
9132 }
9133 else {
9134 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9135 reason, input, &exc,
9136 collstart, collend, &newpos);
9137 if (repunicode == NULL)
9138 goto onError;
9139 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009140 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009141 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009142 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009143 Py_DECREF(repunicode);
9144 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009145 }
9146 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009147 Py_XDECREF(exc);
9148 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009149 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009150
Benjamin Peterson29060642009-01-31 22:14:21 +00009151 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009152 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009153 Py_XDECREF(exc);
9154 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009155 return NULL;
9156}
9157
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009158/* Deprecated. Use PyUnicode_Translate instead. */
9159PyObject *
9160PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9161 Py_ssize_t size,
9162 PyObject *mapping,
9163 const char *errors)
9164{
Christian Heimes5f520f42012-09-11 14:03:25 +02009165 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009166 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009167 if (!unicode)
9168 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009169 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9170 Py_DECREF(unicode);
9171 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009172}
9173
Alexander Belopolsky40018472011-02-26 01:02:56 +00009174PyObject *
9175PyUnicode_Translate(PyObject *str,
9176 PyObject *mapping,
9177 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009178{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009179 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009180 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009181 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009182}
Tim Petersced69f82003-09-16 20:30:58 +00009183
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009184PyObject *
9185_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9186{
9187 if (!PyUnicode_Check(unicode)) {
9188 PyErr_BadInternalCall();
9189 return NULL;
9190 }
9191 if (PyUnicode_READY(unicode) == -1)
9192 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009193 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009194 /* If the string is already ASCII, just return the same string */
9195 Py_INCREF(unicode);
9196 return unicode;
9197 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009198
9199 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9200 PyObject *result = PyUnicode_New(len, 127);
9201 if (result == NULL) {
9202 return NULL;
9203 }
9204
9205 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9206 int kind = PyUnicode_KIND(unicode);
9207 const void *data = PyUnicode_DATA(unicode);
9208 Py_ssize_t i;
9209 for (i = 0; i < len; ++i) {
9210 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9211 if (ch < 127) {
9212 out[i] = ch;
9213 }
9214 else if (Py_UNICODE_ISSPACE(ch)) {
9215 out[i] = ' ';
9216 }
9217 else {
9218 int decimal = Py_UNICODE_TODECIMAL(ch);
9219 if (decimal < 0) {
9220 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009221 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009222 _PyUnicode_LENGTH(result) = i + 1;
9223 break;
9224 }
9225 out[i] = '0' + decimal;
9226 }
9227 }
9228
INADA Naoki16dfca42018-07-14 12:06:43 +09009229 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009230 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009231}
9232
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009233PyObject *
9234PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9235 Py_ssize_t length)
9236{
Victor Stinnerf0124502011-11-21 23:12:56 +01009237 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009238 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009239 Py_UCS4 maxchar;
9240 enum PyUnicode_Kind kind;
9241 void *data;
9242
Victor Stinner99d7ad02012-02-22 13:37:39 +01009243 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009244 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009245 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009246 if (ch > 127) {
9247 int decimal = Py_UNICODE_TODECIMAL(ch);
9248 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009249 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009250 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009251 }
9252 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009253
9254 /* Copy to a new string */
9255 decimal = PyUnicode_New(length, maxchar);
9256 if (decimal == NULL)
9257 return decimal;
9258 kind = PyUnicode_KIND(decimal);
9259 data = PyUnicode_DATA(decimal);
9260 /* Iterate over code points */
9261 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009262 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009263 if (ch > 127) {
9264 int decimal = Py_UNICODE_TODECIMAL(ch);
9265 if (decimal >= 0)
9266 ch = '0' + decimal;
9267 }
9268 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009270 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009271}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009272/* --- Decimal Encoder ---------------------------------------------------- */
9273
Alexander Belopolsky40018472011-02-26 01:02:56 +00009274int
9275PyUnicode_EncodeDecimal(Py_UNICODE *s,
9276 Py_ssize_t length,
9277 char *output,
9278 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009279{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009280 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009281 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009282 enum PyUnicode_Kind kind;
9283 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009284
9285 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009286 PyErr_BadArgument();
9287 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009288 }
9289
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009290 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009291 if (unicode == NULL)
9292 return -1;
9293
Victor Stinner42bf7752011-11-21 22:52:58 +01009294 kind = PyUnicode_KIND(unicode);
9295 data = PyUnicode_DATA(unicode);
9296
Victor Stinnerb84d7232011-11-22 01:50:07 +01009297 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009298 PyObject *exc;
9299 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009300 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009301 Py_ssize_t startpos;
9302
9303 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009304
Benjamin Peterson29060642009-01-31 22:14:21 +00009305 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009306 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009307 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009308 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009309 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009310 decimal = Py_UNICODE_TODECIMAL(ch);
9311 if (decimal >= 0) {
9312 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009313 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009314 continue;
9315 }
9316 if (0 < ch && ch < 256) {
9317 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009318 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 continue;
9320 }
Victor Stinner6345be92011-11-25 20:09:01 +01009321
Victor Stinner42bf7752011-11-21 22:52:58 +01009322 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009323 exc = NULL;
9324 raise_encode_exception(&exc, "decimal", unicode,
9325 startpos, startpos+1,
9326 "invalid decimal Unicode string");
9327 Py_XDECREF(exc);
9328 Py_DECREF(unicode);
9329 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009330 }
9331 /* 0-terminate the output string */
9332 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009333 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009334 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009335}
9336
Guido van Rossumd57fd912000-03-10 22:53:23 +00009337/* --- Helpers ------------------------------------------------------------ */
9338
/* Helper macro to fix up start/end slice values against a length:
   `end` is clamped to `len`; a negative `start` or `end` is taken relative
   to `len` and clamped to 0. `start` and `end` must be modifiable lvalues
   and are evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. as the unbraced body of an if/else). */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9353
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009354static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009355any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009356 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009357 Py_ssize_t end,
9358 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009359{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009360 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009361 void *buf1, *buf2;
9362 Py_ssize_t len1, len2, result;
9363
9364 kind1 = PyUnicode_KIND(s1);
9365 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009366 if (kind1 < kind2)
9367 return -1;
9368
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009369 len1 = PyUnicode_GET_LENGTH(s1);
9370 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009371 ADJUST_INDICES(start, end, len1);
9372 if (end - start < len2)
9373 return -1;
9374
9375 buf1 = PyUnicode_DATA(s1);
9376 buf2 = PyUnicode_DATA(s2);
9377 if (len2 == 1) {
9378 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9379 result = findchar((const char *)buf1 + kind1*start,
9380 kind1, end - start, ch, direction);
9381 if (result == -1)
9382 return -1;
9383 else
9384 return start + result;
9385 }
9386
9387 if (kind2 != kind1) {
9388 buf2 = _PyUnicode_AsKind(s2, kind1);
9389 if (!buf2)
9390 return -2;
9391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392
Victor Stinner794d5672011-10-10 03:21:36 +02009393 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009394 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009395 case PyUnicode_1BYTE_KIND:
9396 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9397 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9398 else
9399 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9400 break;
9401 case PyUnicode_2BYTE_KIND:
9402 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9403 break;
9404 case PyUnicode_4BYTE_KIND:
9405 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9406 break;
9407 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009408 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009409 }
9410 }
9411 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009412 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009413 case PyUnicode_1BYTE_KIND:
9414 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9415 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9416 else
9417 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9418 break;
9419 case PyUnicode_2BYTE_KIND:
9420 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9421 break;
9422 case PyUnicode_4BYTE_KIND:
9423 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9424 break;
9425 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009426 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009427 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009428 }
9429
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009430 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009431 PyMem_Free(buf2);
9432
9433 return result;
9434}
9435
Victor Stinner59423e32018-11-26 13:40:01 +01009436/* _PyUnicode_InsertThousandsGrouping() helper functions */
9437#include "stringlib/localeutil.h"
9438
9439/**
9440 * InsertThousandsGrouping:
9441 * @writer: Unicode writer.
9442 * @n_buffer: Number of characters in @buffer.
9443 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9444 * @d_pos: Start of digits string.
9445 * @n_digits: The number of digits in the string, in which we want
9446 * to put the grouping chars.
9447 * @min_width: The minimum width of the digits in the output string.
9448 * Output will be zero-padded on the left to fill.
9449 * @grouping: see definition in localeconv().
9450 * @thousands_sep: see definition in localeconv().
9451 *
9452 * There are 2 modes: counting and filling. If @writer is NULL,
9453 * we are in counting mode, else filling mode.
9454 * If counting, the required buffer size is returned.
9455 * If filling, we know the buffer will be large enough, so we don't
9456 * need to pass in the buffer size.
9457 * Inserts thousand grouping characters (as defined by grouping and
9458 * thousands_sep) into @writer.
9459 *
9460 * Return value: -1 on error, number of characters otherwise.
9461 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009462Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009463_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009464 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009465 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009466 PyObject *digits,
9467 Py_ssize_t d_pos,
9468 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009469 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009470 const char *grouping,
9471 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009472 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009473{
Xtreak3f7983a2019-01-07 20:39:14 +05309474 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009475 if (writer) {
9476 assert(digits != NULL);
9477 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009478 }
9479 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009480 assert(digits == NULL);
9481 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009482 }
Victor Stinner59423e32018-11-26 13:40:01 +01009483 assert(0 <= d_pos);
9484 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009485 assert(grouping != NULL);
9486
9487 if (digits != NULL) {
9488 if (PyUnicode_READY(digits) == -1) {
9489 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009490 }
Victor Stinner59423e32018-11-26 13:40:01 +01009491 }
9492 if (PyUnicode_READY(thousands_sep) == -1) {
9493 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009494 }
9495
Victor Stinner59423e32018-11-26 13:40:01 +01009496 Py_ssize_t count = 0;
9497 Py_ssize_t n_zeros;
9498 int loop_broken = 0;
9499 int use_separator = 0; /* First time through, don't append the
9500 separator. They only go between
9501 groups. */
9502 Py_ssize_t buffer_pos;
9503 Py_ssize_t digits_pos;
9504 Py_ssize_t len;
9505 Py_ssize_t n_chars;
9506 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9507 be looked at */
9508 /* A generator that returns all of the grouping widths, until it
9509 returns 0. */
9510 GroupGenerator groupgen;
9511 GroupGenerator_init(&groupgen, grouping);
9512 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9513
9514 /* if digits are not grouped, thousands separator
9515 should be an empty string */
9516 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9517
9518 digits_pos = d_pos + n_digits;
9519 if (writer) {
9520 buffer_pos = writer->pos + n_buffer;
9521 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9522 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009523 }
Victor Stinner59423e32018-11-26 13:40:01 +01009524 else {
9525 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009526 }
Victor Stinner59423e32018-11-26 13:40:01 +01009527
9528 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009529 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009530 }
Victor Stinner59423e32018-11-26 13:40:01 +01009531
9532 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9533 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9534 n_zeros = Py_MAX(0, len - remaining);
9535 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9536
9537 /* Use n_zero zero's and n_chars chars */
9538
9539 /* Count only, don't do anything. */
9540 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9541
9542 /* Copy into the writer. */
9543 InsertThousandsGrouping_fill(writer, &buffer_pos,
9544 digits, &digits_pos,
9545 n_chars, n_zeros,
9546 use_separator ? thousands_sep : NULL,
9547 thousands_sep_len, maxchar);
9548
9549 /* Use a separator next time. */
9550 use_separator = 1;
9551
9552 remaining -= n_chars;
9553 min_width -= len;
9554
9555 if (remaining <= 0 && min_width <= 0) {
9556 loop_broken = 1;
9557 break;
9558 }
9559 min_width -= thousands_sep_len;
9560 }
9561 if (!loop_broken) {
9562 /* We left the loop without using a break statement. */
9563
9564 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9565 n_zeros = Py_MAX(0, len - remaining);
9566 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9567
9568 /* Use n_zero zero's and n_chars chars */
9569 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9570
9571 /* Copy into the writer. */
9572 InsertThousandsGrouping_fill(writer, &buffer_pos,
9573 digits, &digits_pos,
9574 n_chars, n_zeros,
9575 use_separator ? thousands_sep : NULL,
9576 thousands_sep_len, maxchar);
9577 }
9578 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009579}
9580
9581
Alexander Belopolsky40018472011-02-26 01:02:56 +00009582Py_ssize_t
9583PyUnicode_Count(PyObject *str,
9584 PyObject *substr,
9585 Py_ssize_t start,
9586 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009587{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009588 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009589 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 void *buf1 = NULL, *buf2 = NULL;
9591 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009592
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009593 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009594 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009595
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009596 kind1 = PyUnicode_KIND(str);
9597 kind2 = PyUnicode_KIND(substr);
9598 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009599 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009600
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009601 len1 = PyUnicode_GET_LENGTH(str);
9602 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009604 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009605 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009606
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009607 buf1 = PyUnicode_DATA(str);
9608 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009609 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009610 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009611 if (!buf2)
9612 goto onError;
9613 }
9614
9615 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009616 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009617 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009618 result = asciilib_count(
9619 ((Py_UCS1*)buf1) + start, end - start,
9620 buf2, len2, PY_SSIZE_T_MAX
9621 );
9622 else
9623 result = ucs1lib_count(
9624 ((Py_UCS1*)buf1) + start, end - start,
9625 buf2, len2, PY_SSIZE_T_MAX
9626 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009627 break;
9628 case PyUnicode_2BYTE_KIND:
9629 result = ucs2lib_count(
9630 ((Py_UCS2*)buf1) + start, end - start,
9631 buf2, len2, PY_SSIZE_T_MAX
9632 );
9633 break;
9634 case PyUnicode_4BYTE_KIND:
9635 result = ucs4lib_count(
9636 ((Py_UCS4*)buf1) + start, end - start,
9637 buf2, len2, PY_SSIZE_T_MAX
9638 );
9639 break;
9640 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009641 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009642 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009643
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009644 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 PyMem_Free(buf2);
9646
Guido van Rossumd57fd912000-03-10 22:53:23 +00009647 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009649 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 PyMem_Free(buf2);
9651 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009652}
9653
Alexander Belopolsky40018472011-02-26 01:02:56 +00009654Py_ssize_t
9655PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009656 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009657 Py_ssize_t start,
9658 Py_ssize_t end,
9659 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009660{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009661 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009662 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009663
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009664 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665}
9666
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009667Py_ssize_t
9668PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9669 Py_ssize_t start, Py_ssize_t end,
9670 int direction)
9671{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009672 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009673 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 if (PyUnicode_READY(str) == -1)
9675 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009676 len = PyUnicode_GET_LENGTH(str);
9677 ADJUST_INDICES(start, end, len);
9678 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009679 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009680 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009681 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9682 kind, end-start, ch, direction);
9683 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009684 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009685 else
9686 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009687}
9688
Alexander Belopolsky40018472011-02-26 01:02:56 +00009689static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009690tailmatch(PyObject *self,
9691 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009692 Py_ssize_t start,
9693 Py_ssize_t end,
9694 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 int kind_self;
9697 int kind_sub;
9698 void *data_self;
9699 void *data_sub;
9700 Py_ssize_t offset;
9701 Py_ssize_t i;
9702 Py_ssize_t end_sub;
9703
9704 if (PyUnicode_READY(self) == -1 ||
9705 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009706 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009708 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9709 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009710 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009711 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009712
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009713 if (PyUnicode_GET_LENGTH(substring) == 0)
9714 return 1;
9715
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009716 kind_self = PyUnicode_KIND(self);
9717 data_self = PyUnicode_DATA(self);
9718 kind_sub = PyUnicode_KIND(substring);
9719 data_sub = PyUnicode_DATA(substring);
9720 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9721
9722 if (direction > 0)
9723 offset = end;
9724 else
9725 offset = start;
9726
9727 if (PyUnicode_READ(kind_self, data_self, offset) ==
9728 PyUnicode_READ(kind_sub, data_sub, 0) &&
9729 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9730 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9731 /* If both are of the same kind, memcmp is sufficient */
9732 if (kind_self == kind_sub) {
9733 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009734 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009735 data_sub,
9736 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009737 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009738 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009739 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740 else {
9741 /* We do not need to compare 0 and len(substring)-1 because
9742 the if statement above ensured already that they are equal
9743 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009744 for (i = 1; i < end_sub; ++i) {
9745 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9746 PyUnicode_READ(kind_sub, data_sub, i))
9747 return 0;
9748 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009749 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009751 }
9752
9753 return 0;
9754}
9755
Alexander Belopolsky40018472011-02-26 01:02:56 +00009756Py_ssize_t
9757PyUnicode_Tailmatch(PyObject *str,
9758 PyObject *substr,
9759 Py_ssize_t start,
9760 Py_ssize_t end,
9761 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009762{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009763 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009764 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009765
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009766 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009767}
9768
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009769static PyObject *
9770ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009771{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009772 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9773 char *resdata, *data = PyUnicode_DATA(self);
9774 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009775
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776 res = PyUnicode_New(len, 127);
9777 if (res == NULL)
9778 return NULL;
9779 resdata = PyUnicode_DATA(res);
9780 if (lower)
9781 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009782 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009783 _Py_bytes_upper(resdata, data, len);
9784 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009785}
9786
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009787static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009788handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009789{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009790 Py_ssize_t j;
9791 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009792 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009793 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009794
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9796
9797 where ! is a negation and \p{xxx} is a character with property xxx.
9798 */
9799 for (j = i - 1; j >= 0; j--) {
9800 c = PyUnicode_READ(kind, data, j);
9801 if (!_PyUnicode_IsCaseIgnorable(c))
9802 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009803 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009804 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9805 if (final_sigma) {
9806 for (j = i + 1; j < length; j++) {
9807 c = PyUnicode_READ(kind, data, j);
9808 if (!_PyUnicode_IsCaseIgnorable(c))
9809 break;
9810 }
9811 final_sigma = j == length || !_PyUnicode_IsCased(c);
9812 }
9813 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009814}
9815
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816static int
9817lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9818 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009819{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009820 /* Obscure special case. */
9821 if (c == 0x3A3) {
9822 mapped[0] = handle_capital_sigma(kind, data, length, i);
9823 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009824 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009825 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009826}
9827
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828static Py_ssize_t
9829do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009830{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009831 Py_ssize_t i, k = 0;
9832 int n_res, j;
9833 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009834
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009835 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009836 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009838 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009839 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009840 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009841 for (i = 1; i < length; i++) {
9842 c = PyUnicode_READ(kind, data, i);
9843 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9844 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009845 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009846 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009847 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009848 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009849 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009850}
9851
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009852static Py_ssize_t
9853do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9854 Py_ssize_t i, k = 0;
9855
9856 for (i = 0; i < length; i++) {
9857 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9858 int n_res, j;
9859 if (Py_UNICODE_ISUPPER(c)) {
9860 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9861 }
9862 else if (Py_UNICODE_ISLOWER(c)) {
9863 n_res = _PyUnicode_ToUpperFull(c, mapped);
9864 }
9865 else {
9866 n_res = 1;
9867 mapped[0] = c;
9868 }
9869 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009870 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009871 res[k++] = mapped[j];
9872 }
9873 }
9874 return k;
9875}
9876
9877static Py_ssize_t
9878do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9879 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009880{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881 Py_ssize_t i, k = 0;
9882
9883 for (i = 0; i < length; i++) {
9884 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9885 int n_res, j;
9886 if (lower)
9887 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9888 else
9889 n_res = _PyUnicode_ToUpperFull(c, mapped);
9890 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009891 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009892 res[k++] = mapped[j];
9893 }
9894 }
9895 return k;
9896}
9897
9898static Py_ssize_t
9899do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9900{
9901 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9902}
9903
9904static Py_ssize_t
9905do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9906{
9907 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9908}
9909
Benjamin Petersone51757f2012-01-12 21:10:29 -05009910static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009911do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9912{
9913 Py_ssize_t i, k = 0;
9914
9915 for (i = 0; i < length; i++) {
9916 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9917 Py_UCS4 mapped[3];
9918 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9919 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009920 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009921 res[k++] = mapped[j];
9922 }
9923 }
9924 return k;
9925}
9926
9927static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009928do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9929{
9930 Py_ssize_t i, k = 0;
9931 int previous_is_cased;
9932
9933 previous_is_cased = 0;
9934 for (i = 0; i < length; i++) {
9935 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9936 Py_UCS4 mapped[3];
9937 int n_res, j;
9938
9939 if (previous_is_cased)
9940 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9941 else
9942 n_res = _PyUnicode_ToTitleFull(c, mapped);
9943
9944 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009945 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009946 res[k++] = mapped[j];
9947 }
9948
9949 previous_is_cased = _PyUnicode_IsCased(c);
9950 }
9951 return k;
9952}
9953
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009954static PyObject *
9955case_operation(PyObject *self,
9956 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9957{
9958 PyObject *res = NULL;
9959 Py_ssize_t length, newlength = 0;
9960 int kind, outkind;
9961 void *data, *outdata;
9962 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9963
Benjamin Petersoneea48462012-01-16 14:28:50 -05009964 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009965
9966 kind = PyUnicode_KIND(self);
9967 data = PyUnicode_DATA(self);
9968 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009969 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009970 PyErr_SetString(PyExc_OverflowError, "string is too long");
9971 return NULL;
9972 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009973 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009974 if (tmp == NULL)
9975 return PyErr_NoMemory();
9976 newlength = perform(kind, data, length, tmp, &maxchar);
9977 res = PyUnicode_New(newlength, maxchar);
9978 if (res == NULL)
9979 goto leave;
9980 tmpend = tmp + newlength;
9981 outdata = PyUnicode_DATA(res);
9982 outkind = PyUnicode_KIND(res);
9983 switch (outkind) {
9984 case PyUnicode_1BYTE_KIND:
9985 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9986 break;
9987 case PyUnicode_2BYTE_KIND:
9988 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9989 break;
9990 case PyUnicode_4BYTE_KIND:
9991 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9992 break;
9993 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009994 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009995 }
9996 leave:
9997 PyMem_FREE(tmp);
9998 return res;
9999}
10000
Tim Peters8ce9f162004-08-27 01:49:32 +000010001PyObject *
10002PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010003{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010004 PyObject *res;
10005 PyObject *fseq;
10006 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010008
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010009 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010010 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010011 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010012 }
10013
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010014 /* NOTE: the following code can't call back into Python code,
10015 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010016 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010018 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010019 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010020 res = _PyUnicode_JoinArray(separator, items, seqlen);
10021 Py_DECREF(fseq);
10022 return res;
10023}
10024
10025PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010026_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010027{
10028 PyObject *res = NULL; /* the result */
10029 PyObject *sep = NULL;
10030 Py_ssize_t seplen;
10031 PyObject *item;
10032 Py_ssize_t sz, i, res_offset;
10033 Py_UCS4 maxchar;
10034 Py_UCS4 item_maxchar;
10035 int use_memcpy;
10036 unsigned char *res_data = NULL, *sep_data = NULL;
10037 PyObject *last_obj;
10038 unsigned int kind = 0;
10039
Tim Peters05eba1f2004-08-27 21:32:02 +000010040 /* If empty sequence, return u"". */
10041 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010042 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010043 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010044
Tim Peters05eba1f2004-08-27 21:32:02 +000010045 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010046 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010047 if (seqlen == 1) {
10048 if (PyUnicode_CheckExact(items[0])) {
10049 res = items[0];
10050 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010051 return res;
10052 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010053 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010054 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010055 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010056 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010057 /* Set up sep and seplen */
10058 if (separator == NULL) {
10059 /* fall back to a blank space separator */
10060 sep = PyUnicode_FromOrdinal(' ');
10061 if (!sep)
10062 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010063 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010064 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010065 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010066 else {
10067 if (!PyUnicode_Check(separator)) {
10068 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010069 "separator: expected str instance,"
10070 " %.80s found",
10071 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010072 goto onError;
10073 }
10074 if (PyUnicode_READY(separator))
10075 goto onError;
10076 sep = separator;
10077 seplen = PyUnicode_GET_LENGTH(separator);
10078 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10079 /* inc refcount to keep this code path symmetric with the
10080 above case of a blank separator */
10081 Py_INCREF(sep);
10082 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010083 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010084 }
10085
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010086 /* There are at least two things to join, or else we have a subclass
10087 * of str in the sequence.
10088 * Do a pre-pass to figure out the total amount of space we'll
10089 * need (sz), and see whether all argument are strings.
10090 */
10091 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010092#ifdef Py_DEBUG
10093 use_memcpy = 0;
10094#else
10095 use_memcpy = 1;
10096#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010097 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010098 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010099 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010100 if (!PyUnicode_Check(item)) {
10101 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010102 "sequence item %zd: expected str instance,"
10103 " %.80s found",
10104 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010105 goto onError;
10106 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 if (PyUnicode_READY(item) == -1)
10108 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010109 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010111 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010112 if (i != 0) {
10113 add_sz += seplen;
10114 }
10115 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010116 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010117 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010118 goto onError;
10119 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010120 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010121 if (use_memcpy && last_obj != NULL) {
10122 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10123 use_memcpy = 0;
10124 }
10125 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010126 }
Tim Petersced69f82003-09-16 20:30:58 +000010127
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010128 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010129 if (res == NULL)
10130 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010131
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010132 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010133#ifdef Py_DEBUG
10134 use_memcpy = 0;
10135#else
10136 if (use_memcpy) {
10137 res_data = PyUnicode_1BYTE_DATA(res);
10138 kind = PyUnicode_KIND(res);
10139 if (seplen != 0)
10140 sep_data = PyUnicode_1BYTE_DATA(sep);
10141 }
10142#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010143 if (use_memcpy) {
10144 for (i = 0; i < seqlen; ++i) {
10145 Py_ssize_t itemlen;
10146 item = items[i];
10147
10148 /* Copy item, and maybe the separator. */
10149 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010150 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010151 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010152 kind * seplen);
10153 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010154 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010155
10156 itemlen = PyUnicode_GET_LENGTH(item);
10157 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010158 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010159 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010160 kind * itemlen);
10161 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010162 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010163 }
10164 assert(res_data == PyUnicode_1BYTE_DATA(res)
10165 + kind * PyUnicode_GET_LENGTH(res));
10166 }
10167 else {
10168 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10169 Py_ssize_t itemlen;
10170 item = items[i];
10171
10172 /* Copy item, and maybe the separator. */
10173 if (i && seplen != 0) {
10174 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10175 res_offset += seplen;
10176 }
10177
10178 itemlen = PyUnicode_GET_LENGTH(item);
10179 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010180 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010181 res_offset += itemlen;
10182 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010183 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010184 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010185 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010186
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010187 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010188 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010190
Benjamin Peterson29060642009-01-31 22:14:21 +000010191 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010192 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010193 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010194 return NULL;
10195}
10196
Victor Stinnerd3f08822012-05-29 12:57:52 +020010197void
10198_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10199 Py_UCS4 fill_char)
10200{
10201 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010202 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010203 assert(PyUnicode_IS_READY(unicode));
10204 assert(unicode_modifiable(unicode));
10205 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10206 assert(start >= 0);
10207 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010208 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010209}
10210
Victor Stinner3fe55312012-01-04 00:33:50 +010010211Py_ssize_t
10212PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10213 Py_UCS4 fill_char)
10214{
10215 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010216
10217 if (!PyUnicode_Check(unicode)) {
10218 PyErr_BadInternalCall();
10219 return -1;
10220 }
10221 if (PyUnicode_READY(unicode) == -1)
10222 return -1;
10223 if (unicode_check_modifiable(unicode))
10224 return -1;
10225
Victor Stinnerd3f08822012-05-29 12:57:52 +020010226 if (start < 0) {
10227 PyErr_SetString(PyExc_IndexError, "string index out of range");
10228 return -1;
10229 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010230 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10231 PyErr_SetString(PyExc_ValueError,
10232 "fill character is bigger than "
10233 "the string maximum character");
10234 return -1;
10235 }
10236
10237 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10238 length = Py_MIN(maxlen, length);
10239 if (length <= 0)
10240 return 0;
10241
Victor Stinnerd3f08822012-05-29 12:57:52 +020010242 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010243 return length;
10244}
10245
Victor Stinner9310abb2011-10-05 00:59:23 +020010246static PyObject *
10247pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010248 Py_ssize_t left,
10249 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 PyObject *u;
10253 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010254 int kind;
10255 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256
10257 if (left < 0)
10258 left = 0;
10259 if (right < 0)
10260 right = 0;
10261
Victor Stinnerc4b49542011-12-11 22:44:26 +010010262 if (left == 0 && right == 0)
10263 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010264
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10266 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010267 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10268 return NULL;
10269 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010271 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010273 if (!u)
10274 return NULL;
10275
10276 kind = PyUnicode_KIND(u);
10277 data = PyUnicode_DATA(u);
10278 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010279 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010280 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010281 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010282 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010283 assert(_PyUnicode_CheckConsistency(u, 1));
10284 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010285}
10286
Alexander Belopolsky40018472011-02-26 01:02:56 +000010287PyObject *
10288PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010290 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010291
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010292 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010293 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010294
Benjamin Petersonead6b532011-12-20 17:23:42 -060010295 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010296 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010297 if (PyUnicode_IS_ASCII(string))
10298 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010299 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010300 PyUnicode_GET_LENGTH(string), keepends);
10301 else
10302 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010303 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010304 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 break;
10306 case PyUnicode_2BYTE_KIND:
10307 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010309 PyUnicode_GET_LENGTH(string), keepends);
10310 break;
10311 case PyUnicode_4BYTE_KIND:
10312 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 PyUnicode_GET_LENGTH(string), keepends);
10315 break;
10316 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010317 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010319 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010320}
10321
Alexander Belopolsky40018472011-02-26 01:02:56 +000010322static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010323split(PyObject *self,
10324 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010325 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010326{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010327 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 void *buf1, *buf2;
10329 Py_ssize_t len1, len2;
10330 PyObject* out;
10331
Guido van Rossumd57fd912000-03-10 22:53:23 +000010332 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010333 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010334
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 if (PyUnicode_READY(self) == -1)
10336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010337
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010339 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010341 if (PyUnicode_IS_ASCII(self))
10342 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010343 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010344 PyUnicode_GET_LENGTH(self), maxcount
10345 );
10346 else
10347 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(self), maxcount
10350 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 case PyUnicode_2BYTE_KIND:
10352 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
10356 case PyUnicode_4BYTE_KIND:
10357 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010362 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 }
10364
10365 if (PyUnicode_READY(substring) == -1)
10366 return NULL;
10367
10368 kind1 = PyUnicode_KIND(self);
10369 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 len1 = PyUnicode_GET_LENGTH(self);
10371 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010372 if (kind1 < kind2 || len1 < len2) {
10373 out = PyList_New(1);
10374 if (out == NULL)
10375 return NULL;
10376 Py_INCREF(self);
10377 PyList_SET_ITEM(out, 0, self);
10378 return out;
10379 }
10380 buf1 = PyUnicode_DATA(self);
10381 buf2 = PyUnicode_DATA(substring);
10382 if (kind2 != kind1) {
10383 buf2 = _PyUnicode_AsKind(substring, kind1);
10384 if (!buf2)
10385 return NULL;
10386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010388 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010389 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010390 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10391 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010392 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010393 else
10394 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010395 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 break;
10397 case PyUnicode_2BYTE_KIND:
10398 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010400 break;
10401 case PyUnicode_4BYTE_KIND:
10402 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010403 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 break;
10405 default:
10406 out = NULL;
10407 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010408 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 PyMem_Free(buf2);
10410 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010411}
10412
Alexander Belopolsky40018472011-02-26 01:02:56 +000010413static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010414rsplit(PyObject *self,
10415 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010416 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010417{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010418 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 void *buf1, *buf2;
10420 Py_ssize_t len1, len2;
10421 PyObject* out;
10422
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010423 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010424 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010425
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010426 if (PyUnicode_READY(self) == -1)
10427 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010428
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010430 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010432 if (PyUnicode_IS_ASCII(self))
10433 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010434 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010435 PyUnicode_GET_LENGTH(self), maxcount
10436 );
10437 else
10438 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010439 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010440 PyUnicode_GET_LENGTH(self), maxcount
10441 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 case PyUnicode_2BYTE_KIND:
10443 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010444 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010445 PyUnicode_GET_LENGTH(self), maxcount
10446 );
10447 case PyUnicode_4BYTE_KIND:
10448 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010449 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010450 PyUnicode_GET_LENGTH(self), maxcount
10451 );
10452 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010453 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010454 }
10455
10456 if (PyUnicode_READY(substring) == -1)
10457 return NULL;
10458
10459 kind1 = PyUnicode_KIND(self);
10460 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010461 len1 = PyUnicode_GET_LENGTH(self);
10462 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010463 if (kind1 < kind2 || len1 < len2) {
10464 out = PyList_New(1);
10465 if (out == NULL)
10466 return NULL;
10467 Py_INCREF(self);
10468 PyList_SET_ITEM(out, 0, self);
10469 return out;
10470 }
10471 buf1 = PyUnicode_DATA(self);
10472 buf2 = PyUnicode_DATA(substring);
10473 if (kind2 != kind1) {
10474 buf2 = _PyUnicode_AsKind(substring, kind1);
10475 if (!buf2)
10476 return NULL;
10477 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010479 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010481 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10482 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010483 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010484 else
10485 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010486 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 break;
10488 case PyUnicode_2BYTE_KIND:
10489 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010490 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010491 break;
10492 case PyUnicode_4BYTE_KIND:
10493 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010494 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 break;
10496 default:
10497 out = NULL;
10498 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010499 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500 PyMem_Free(buf2);
10501 return out;
10502}
10503
10504static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010505anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10506 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010507{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010508 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010510 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10511 return asciilib_find(buf1, len1, buf2, len2, offset);
10512 else
10513 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 case PyUnicode_2BYTE_KIND:
10515 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10516 case PyUnicode_4BYTE_KIND:
10517 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10518 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010519 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520}
10521
10522static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010523anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10524 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010525{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010526 switch (kind) {
10527 case PyUnicode_1BYTE_KIND:
10528 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10529 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10530 else
10531 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10532 case PyUnicode_2BYTE_KIND:
10533 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10534 case PyUnicode_4BYTE_KIND:
10535 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10536 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010537 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010538}
10539
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010540static void
10541replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10542 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10543{
10544 int kind = PyUnicode_KIND(u);
10545 void *data = PyUnicode_DATA(u);
10546 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10547 if (kind == PyUnicode_1BYTE_KIND) {
10548 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10549 (Py_UCS1 *)data + len,
10550 u1, u2, maxcount);
10551 }
10552 else if (kind == PyUnicode_2BYTE_KIND) {
10553 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10554 (Py_UCS2 *)data + len,
10555 u1, u2, maxcount);
10556 }
10557 else {
10558 assert(kind == PyUnicode_4BYTE_KIND);
10559 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10560 (Py_UCS4 *)data + len,
10561 u1, u2, maxcount);
10562 }
10563}
10564
Alexander Belopolsky40018472011-02-26 01:02:56 +000010565static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566replace(PyObject *self, PyObject *str1,
10567 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010568{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010569 PyObject *u;
10570 char *sbuf = PyUnicode_DATA(self);
10571 char *buf1 = PyUnicode_DATA(str1);
10572 char *buf2 = PyUnicode_DATA(str2);
10573 int srelease = 0, release1 = 0, release2 = 0;
10574 int skind = PyUnicode_KIND(self);
10575 int kind1 = PyUnicode_KIND(str1);
10576 int kind2 = PyUnicode_KIND(str2);
10577 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10578 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10579 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010580 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010581 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010582
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010583 if (slen < len1)
10584 goto nothing;
10585
Guido van Rossumd57fd912000-03-10 22:53:23 +000010586 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010587 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010588 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010589 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010590
Victor Stinner59de0ee2011-10-07 10:01:28 +020010591 if (str1 == str2)
10592 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010595 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10596 if (maxchar < maxchar_str1)
10597 /* substring too wide to be present */
10598 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010599 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10600 /* Replacing str1 with str2 may cause a maxchar reduction in the
10601 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010602 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010603 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010606 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010608 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010609 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010611 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010612 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010613
Victor Stinner69ed0f42013-04-09 21:48:24 +020010614 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010615 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010616 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010617 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010618 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010619 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010620 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010621 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010622
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010623 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10624 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010625 }
10626 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 int rkind = skind;
10628 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010629 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 if (kind1 < rkind) {
10632 /* widen substring */
10633 buf1 = _PyUnicode_AsKind(str1, rkind);
10634 if (!buf1) goto error;
10635 release1 = 1;
10636 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010637 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010638 if (i < 0)
10639 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (rkind > kind2) {
10641 /* widen replacement */
10642 buf2 = _PyUnicode_AsKind(str2, rkind);
10643 if (!buf2) goto error;
10644 release2 = 1;
10645 }
10646 else if (rkind < kind2) {
10647 /* widen self and buf1 */
10648 rkind = kind2;
10649 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010650 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010651 sbuf = _PyUnicode_AsKind(self, rkind);
10652 if (!sbuf) goto error;
10653 srelease = 1;
10654 buf1 = _PyUnicode_AsKind(str1, rkind);
10655 if (!buf1) goto error;
10656 release1 = 1;
10657 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 u = PyUnicode_New(slen, maxchar);
10659 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010661 assert(PyUnicode_KIND(u) == rkind);
10662 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010663
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010665 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010670
10671 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010672 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010674 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010675 if (i == -1)
10676 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010679 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010681 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010682 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010683 }
10684 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010685 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010686 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010687 int rkind = skind;
10688 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010691 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 buf1 = _PyUnicode_AsKind(str1, rkind);
10693 if (!buf1) goto error;
10694 release1 = 1;
10695 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010696 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010697 if (n == 0)
10698 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010699 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010700 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010701 buf2 = _PyUnicode_AsKind(str2, rkind);
10702 if (!buf2) goto error;
10703 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010704 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010706 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010707 rkind = kind2;
10708 sbuf = _PyUnicode_AsKind(self, rkind);
10709 if (!sbuf) goto error;
10710 srelease = 1;
10711 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010712 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010713 buf1 = _PyUnicode_AsKind(str1, rkind);
10714 if (!buf1) goto error;
10715 release1 = 1;
10716 }
10717 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10718 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010719 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010720 PyErr_SetString(PyExc_OverflowError,
10721 "replace string is too long");
10722 goto error;
10723 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010724 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010725 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010726 _Py_INCREF_UNICODE_EMPTY();
10727 if (!unicode_empty)
10728 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010729 u = unicode_empty;
10730 goto done;
10731 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010732 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 PyErr_SetString(PyExc_OverflowError,
10734 "replace string is too long");
10735 goto error;
10736 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010737 u = PyUnicode_New(new_size, maxchar);
10738 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010739 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010740 assert(PyUnicode_KIND(u) == rkind);
10741 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010742 ires = i = 0;
10743 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010744 while (n-- > 0) {
10745 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010746 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010747 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010748 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010749 if (j == -1)
10750 break;
10751 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010752 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010753 memcpy(res + rkind * ires,
10754 sbuf + rkind * i,
10755 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010756 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010757 }
10758 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010760 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010761 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010762 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010763 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010764 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010765 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010766 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010767 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010768 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010769 memcpy(res + rkind * ires,
10770 sbuf + rkind * i,
10771 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010772 }
10773 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010774 /* interleave */
10775 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010776 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010777 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010778 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010779 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010780 if (--n <= 0)
10781 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010782 memcpy(res + rkind * ires,
10783 sbuf + rkind * i,
10784 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010785 ires++;
10786 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010787 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010788 memcpy(res + rkind * ires,
10789 sbuf + rkind * i,
10790 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010791 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010792 }
10793
10794 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010795 unicode_adjust_maxchar(&u);
10796 if (u == NULL)
10797 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010798 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010799
10800 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010801 if (srelease)
10802 PyMem_FREE(sbuf);
10803 if (release1)
10804 PyMem_FREE(buf1);
10805 if (release2)
10806 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010807 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010809
Benjamin Peterson29060642009-01-31 22:14:21 +000010810 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010811 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010812 if (srelease)
10813 PyMem_FREE(sbuf);
10814 if (release1)
10815 PyMem_FREE(buf1);
10816 if (release2)
10817 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010818 return unicode_result_unchanged(self);
10819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010820 error:
10821 if (srelease && sbuf)
10822 PyMem_FREE(sbuf);
10823 if (release1 && buf1)
10824 PyMem_FREE(buf1);
10825 if (release2 && buf2)
10826 PyMem_FREE(buf2);
10827 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010828}
10829
10830/* --- Unicode Object Methods --------------------------------------------- */
10831
INADA Naoki3ae20562017-01-16 20:41:20 +090010832/*[clinic input]
10833str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
INADA Naoki3ae20562017-01-16 20:41:20 +090010835Return a version of the string where each word is titlecased.
10836
10837More specifically, words start with uppercased characters and all remaining
10838cased characters have lower case.
10839[clinic start generated code]*/
10840
10841static PyObject *
10842unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010843/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010845 if (PyUnicode_READY(self) == -1)
10846 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010847 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848}
10849
INADA Naoki3ae20562017-01-16 20:41:20 +090010850/*[clinic input]
10851str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852
INADA Naoki3ae20562017-01-16 20:41:20 +090010853Return a capitalized version of the string.
10854
10855More specifically, make the first character have upper case and the rest lower
10856case.
10857[clinic start generated code]*/
10858
10859static PyObject *
10860unicode_capitalize_impl(PyObject *self)
10861/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010862{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010863 if (PyUnicode_READY(self) == -1)
10864 return NULL;
10865 if (PyUnicode_GET_LENGTH(self) == 0)
10866 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010867 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010868}
10869
INADA Naoki3ae20562017-01-16 20:41:20 +090010870/*[clinic input]
10871str.casefold as unicode_casefold
10872
10873Return a version of the string suitable for caseless comparisons.
10874[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010875
10876static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010877unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010878/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010879{
10880 if (PyUnicode_READY(self) == -1)
10881 return NULL;
10882 if (PyUnicode_IS_ASCII(self))
10883 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010884 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010885}
10886
10887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010888/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010889
10890static int
10891convert_uc(PyObject *obj, void *addr)
10892{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010893 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010894
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010895 if (!PyUnicode_Check(obj)) {
10896 PyErr_Format(PyExc_TypeError,
10897 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010898 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010899 return 0;
10900 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010901 if (PyUnicode_READY(obj) < 0)
10902 return 0;
10903 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010904 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010905 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010906 return 0;
10907 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010908 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010909 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010910}
10911
INADA Naoki3ae20562017-01-16 20:41:20 +090010912/*[clinic input]
10913str.center as unicode_center
10914
10915 width: Py_ssize_t
10916 fillchar: Py_UCS4 = ' '
10917 /
10918
10919Return a centered string of length width.
10920
10921Padding is done using the specified fill character (default is a space).
10922[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010923
10924static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010925unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10926/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010927{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010928 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010929
Benjamin Petersonbac79492012-01-14 13:34:47 -050010930 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010931 return NULL;
10932
Victor Stinnerc4b49542011-12-11 22:44:26 +010010933 if (PyUnicode_GET_LENGTH(self) >= width)
10934 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010935
Victor Stinnerc4b49542011-12-11 22:44:26 +010010936 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010937 left = marg / 2 + (marg & width & 1);
10938
Victor Stinner9310abb2011-10-05 00:59:23 +020010939 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010940}
10941
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010942/* This function assumes that str1 and str2 are readied by the caller. */
10943
Marc-André Lemburge5034372000-08-08 08:04:29 +000010944static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010945unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010946{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010947#define COMPARE(TYPE1, TYPE2) \
10948 do { \
10949 TYPE1* p1 = (TYPE1 *)data1; \
10950 TYPE2* p2 = (TYPE2 *)data2; \
10951 TYPE1* end = p1 + len; \
10952 Py_UCS4 c1, c2; \
10953 for (; p1 != end; p1++, p2++) { \
10954 c1 = *p1; \
10955 c2 = *p2; \
10956 if (c1 != c2) \
10957 return (c1 < c2) ? -1 : 1; \
10958 } \
10959 } \
10960 while (0)
10961
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010962 int kind1, kind2;
10963 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010964 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010965
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010966 kind1 = PyUnicode_KIND(str1);
10967 kind2 = PyUnicode_KIND(str2);
10968 data1 = PyUnicode_DATA(str1);
10969 data2 = PyUnicode_DATA(str2);
10970 len1 = PyUnicode_GET_LENGTH(str1);
10971 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010972 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010973
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010974 switch(kind1) {
10975 case PyUnicode_1BYTE_KIND:
10976 {
10977 switch(kind2) {
10978 case PyUnicode_1BYTE_KIND:
10979 {
10980 int cmp = memcmp(data1, data2, len);
10981 /* normalize result of memcmp() into the range [-1; 1] */
10982 if (cmp < 0)
10983 return -1;
10984 if (cmp > 0)
10985 return 1;
10986 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010987 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010988 case PyUnicode_2BYTE_KIND:
10989 COMPARE(Py_UCS1, Py_UCS2);
10990 break;
10991 case PyUnicode_4BYTE_KIND:
10992 COMPARE(Py_UCS1, Py_UCS4);
10993 break;
10994 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010995 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010996 }
10997 break;
10998 }
10999 case PyUnicode_2BYTE_KIND:
11000 {
11001 switch(kind2) {
11002 case PyUnicode_1BYTE_KIND:
11003 COMPARE(Py_UCS2, Py_UCS1);
11004 break;
11005 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011006 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011007 COMPARE(Py_UCS2, Py_UCS2);
11008 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011009 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011010 case PyUnicode_4BYTE_KIND:
11011 COMPARE(Py_UCS2, Py_UCS4);
11012 break;
11013 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011014 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011015 }
11016 break;
11017 }
11018 case PyUnicode_4BYTE_KIND:
11019 {
11020 switch(kind2) {
11021 case PyUnicode_1BYTE_KIND:
11022 COMPARE(Py_UCS4, Py_UCS1);
11023 break;
11024 case PyUnicode_2BYTE_KIND:
11025 COMPARE(Py_UCS4, Py_UCS2);
11026 break;
11027 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011028 {
11029#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11030 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11031 /* normalize result of wmemcmp() into the range [-1; 1] */
11032 if (cmp < 0)
11033 return -1;
11034 if (cmp > 0)
11035 return 1;
11036#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011037 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011038#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011039 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011040 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011041 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011042 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011043 }
11044 break;
11045 }
11046 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011047 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011048 }
11049
Victor Stinner770e19e2012-10-04 22:59:45 +020011050 if (len1 == len2)
11051 return 0;
11052 if (len1 < len2)
11053 return -1;
11054 else
11055 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011056
11057#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011058}
11059
Benjamin Peterson621b4302016-09-09 13:54:34 -070011060static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011061unicode_compare_eq(PyObject *str1, PyObject *str2)
11062{
11063 int kind;
11064 void *data1, *data2;
11065 Py_ssize_t len;
11066 int cmp;
11067
Victor Stinnere5567ad2012-10-23 02:48:49 +020011068 len = PyUnicode_GET_LENGTH(str1);
11069 if (PyUnicode_GET_LENGTH(str2) != len)
11070 return 0;
11071 kind = PyUnicode_KIND(str1);
11072 if (PyUnicode_KIND(str2) != kind)
11073 return 0;
11074 data1 = PyUnicode_DATA(str1);
11075 data2 = PyUnicode_DATA(str2);
11076
11077 cmp = memcmp(data1, data2, len * kind);
11078 return (cmp == 0);
11079}
11080
11081
Alexander Belopolsky40018472011-02-26 01:02:56 +000011082int
11083PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011084{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011085 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11086 if (PyUnicode_READY(left) == -1 ||
11087 PyUnicode_READY(right) == -1)
11088 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011089
11090 /* a string is equal to itself */
11091 if (left == right)
11092 return 0;
11093
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011094 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011095 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011096 PyErr_Format(PyExc_TypeError,
11097 "Can't compare %.100s and %.100s",
11098 left->ob_type->tp_name,
11099 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011100 return -1;
11101}
11102
Martin v. Löwis5b222132007-06-10 09:51:05 +000011103int
11104PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011106 Py_ssize_t i;
11107 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011108 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011109 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011110
Victor Stinner910337b2011-10-03 03:20:16 +020011111 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011112 if (!PyUnicode_IS_READY(uni)) {
11113 const wchar_t *ws = _PyUnicode_WSTR(uni);
11114 /* Compare Unicode string and source character set string */
11115 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11116 if (chr != ustr[i])
11117 return (chr < ustr[i]) ? -1 : 1;
11118 }
11119 /* This check keeps Python strings that end in '\0' from comparing equal
11120 to C strings identical up to that point. */
11121 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11122 return 1; /* uni is longer */
11123 if (ustr[i])
11124 return -1; /* str is longer */
11125 return 0;
11126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011127 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011128 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011129 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011130 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011131 size_t len, len2 = strlen(str);
11132 int cmp;
11133
11134 len = Py_MIN(len1, len2);
11135 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011136 if (cmp != 0) {
11137 if (cmp < 0)
11138 return -1;
11139 else
11140 return 1;
11141 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011142 if (len1 > len2)
11143 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011144 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011145 return -1; /* str is longer */
11146 return 0;
11147 }
11148 else {
11149 void *data = PyUnicode_DATA(uni);
11150 /* Compare Unicode string and source character set string */
11151 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011152 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011153 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11154 /* This check keeps Python strings that end in '\0' from comparing equal
11155 to C strings identical up to that point. */
11156 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11157 return 1; /* uni is longer */
11158 if (str[i])
11159 return -1; /* str is longer */
11160 return 0;
11161 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011162}
11163
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011164static int
11165non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11166{
11167 size_t i, len;
11168 const wchar_t *p;
11169 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11170 if (strlen(str) != len)
11171 return 0;
11172 p = _PyUnicode_WSTR(unicode);
11173 assert(p);
11174 for (i = 0; i < len; i++) {
11175 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011176 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011177 return 0;
11178 }
11179 return 1;
11180}
11181
11182int
11183_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11184{
11185 size_t len;
11186 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011187 assert(str);
11188#ifndef NDEBUG
11189 for (const char *p = str; *p; p++) {
11190 assert((unsigned char)*p < 128);
11191 }
11192#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011193 if (PyUnicode_READY(unicode) == -1) {
11194 /* Memory error or bad data */
11195 PyErr_Clear();
11196 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11197 }
11198 if (!PyUnicode_IS_ASCII(unicode))
11199 return 0;
11200 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11201 return strlen(str) == len &&
11202 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11203}
11204
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011205int
11206_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11207{
11208 PyObject *right_uni;
11209 Py_hash_t hash;
11210
11211 assert(_PyUnicode_CHECK(left));
11212 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011213#ifndef NDEBUG
11214 for (const char *p = right->string; *p; p++) {
11215 assert((unsigned char)*p < 128);
11216 }
11217#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011218
11219 if (PyUnicode_READY(left) == -1) {
11220 /* memory error or bad data */
11221 PyErr_Clear();
11222 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11223 }
11224
11225 if (!PyUnicode_IS_ASCII(left))
11226 return 0;
11227
11228 right_uni = _PyUnicode_FromId(right); /* borrowed */
11229 if (right_uni == NULL) {
11230 /* memory error or bad data */
11231 PyErr_Clear();
11232 return _PyUnicode_EqualToASCIIString(left, right->string);
11233 }
11234
11235 if (left == right_uni)
11236 return 1;
11237
11238 if (PyUnicode_CHECK_INTERNED(left))
11239 return 0;
11240
INADA Naoki7cc95f52018-01-28 02:07:09 +090011241 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011242 hash = _PyUnicode_HASH(left);
11243 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11244 return 0;
11245
11246 return unicode_compare_eq(left, right_uni);
11247}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011248
Alexander Belopolsky40018472011-02-26 01:02:56 +000011249PyObject *
11250PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011251{
11252 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011253
Victor Stinnere5567ad2012-10-23 02:48:49 +020011254 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11255 Py_RETURN_NOTIMPLEMENTED;
11256
11257 if (PyUnicode_READY(left) == -1 ||
11258 PyUnicode_READY(right) == -1)
11259 return NULL;
11260
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011261 if (left == right) {
11262 switch (op) {
11263 case Py_EQ:
11264 case Py_LE:
11265 case Py_GE:
11266 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011267 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011268 case Py_NE:
11269 case Py_LT:
11270 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011271 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011272 default:
11273 PyErr_BadArgument();
11274 return NULL;
11275 }
11276 }
11277 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011278 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011279 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011280 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011281 }
11282 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011283 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011284 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011285 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011286}
11287
Alexander Belopolsky40018472011-02-26 01:02:56 +000011288int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011289_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11290{
11291 return unicode_eq(aa, bb);
11292}
11293
11294int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011296{
Victor Stinner77282cb2013-04-14 19:22:47 +020011297 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011298 void *buf1, *buf2;
11299 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011300 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011301
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011303 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011304 "'in <string>' requires string as left operand, not %.100s",
11305 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011306 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011307 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011309 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011310 if (ensure_unicode(str) < 0)
11311 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011312
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011313 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011314 kind2 = PyUnicode_KIND(substr);
11315 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011316 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011317 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011318 len2 = PyUnicode_GET_LENGTH(substr);
11319 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011320 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011321 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011322 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011323 if (len2 == 1) {
11324 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11325 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011326 return result;
11327 }
11328 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011329 buf2 = _PyUnicode_AsKind(substr, kind1);
11330 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011331 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011333
Victor Stinner77282cb2013-04-14 19:22:47 +020011334 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011335 case PyUnicode_1BYTE_KIND:
11336 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11337 break;
11338 case PyUnicode_2BYTE_KIND:
11339 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11340 break;
11341 case PyUnicode_4BYTE_KIND:
11342 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11343 break;
11344 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011345 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011346 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011347
Victor Stinner77282cb2013-04-14 19:22:47 +020011348 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011349 PyMem_Free(buf2);
11350
Guido van Rossum403d68b2000-03-13 15:55:09 +000011351 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011352}
11353
Guido van Rossumd57fd912000-03-10 22:53:23 +000011354/* Concat to string or Unicode object giving a new Unicode object. */
11355
Alexander Belopolsky40018472011-02-26 01:02:56 +000011356PyObject *
11357PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011358{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011359 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011360 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011361 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011362
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011363 if (ensure_unicode(left) < 0)
11364 return NULL;
11365
11366 if (!PyUnicode_Check(right)) {
11367 PyErr_Format(PyExc_TypeError,
11368 "can only concatenate str (not \"%.200s\") to str",
11369 right->ob_type->tp_name);
11370 return NULL;
11371 }
11372 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011373 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011374
11375 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011376 if (left == unicode_empty)
11377 return PyUnicode_FromObject(right);
11378 if (right == unicode_empty)
11379 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011380
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011381 left_len = PyUnicode_GET_LENGTH(left);
11382 right_len = PyUnicode_GET_LENGTH(right);
11383 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011384 PyErr_SetString(PyExc_OverflowError,
11385 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011386 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011387 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011388 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011389
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011390 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11391 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011392 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011393
Guido van Rossumd57fd912000-03-10 22:53:23 +000011394 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011395 result = PyUnicode_New(new_len, maxchar);
11396 if (result == NULL)
11397 return NULL;
11398 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11399 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11400 assert(_PyUnicode_CheckConsistency(result, 1));
11401 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011402}
11403
Walter Dörwald1ab83302007-05-18 17:15:44 +000011404void
Victor Stinner23e56682011-10-03 03:54:37 +020011405PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011406{
Victor Stinner23e56682011-10-03 03:54:37 +020011407 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011408 Py_UCS4 maxchar, maxchar2;
11409 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011410
11411 if (p_left == NULL) {
11412 if (!PyErr_Occurred())
11413 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011414 return;
11415 }
Victor Stinner23e56682011-10-03 03:54:37 +020011416 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011417 if (right == NULL || left == NULL
11418 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011419 if (!PyErr_Occurred())
11420 PyErr_BadInternalCall();
11421 goto error;
11422 }
11423
Benjamin Petersonbac79492012-01-14 13:34:47 -050011424 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011425 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011426 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011427 goto error;
11428
Victor Stinner488fa492011-12-12 00:01:39 +010011429 /* Shortcuts */
11430 if (left == unicode_empty) {
11431 Py_DECREF(left);
11432 Py_INCREF(right);
11433 *p_left = right;
11434 return;
11435 }
11436 if (right == unicode_empty)
11437 return;
11438
11439 left_len = PyUnicode_GET_LENGTH(left);
11440 right_len = PyUnicode_GET_LENGTH(right);
11441 if (left_len > PY_SSIZE_T_MAX - right_len) {
11442 PyErr_SetString(PyExc_OverflowError,
11443 "strings are too large to concat");
11444 goto error;
11445 }
11446 new_len = left_len + right_len;
11447
11448 if (unicode_modifiable(left)
11449 && PyUnicode_CheckExact(right)
11450 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011451 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11452 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011453 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011454 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011455 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11456 {
11457 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011458 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011459 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011460
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011461 /* copy 'right' into the newly allocated area of 'left' */
11462 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011463 }
Victor Stinner488fa492011-12-12 00:01:39 +010011464 else {
11465 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11466 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011467 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011468
Victor Stinner488fa492011-12-12 00:01:39 +010011469 /* Concat the two Unicode strings */
11470 res = PyUnicode_New(new_len, maxchar);
11471 if (res == NULL)
11472 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011473 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11474 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011475 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011476 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011477 }
11478 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011479 return;
11480
11481error:
Victor Stinner488fa492011-12-12 00:01:39 +010011482 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011483}
11484
11485void
11486PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11487{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011488 PyUnicode_Append(pleft, right);
11489 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011490}
11491
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011492/*
11493Wraps stringlib_parse_args_finds() and additionally ensures that the
11494first argument is a unicode object.
11495*/
11496
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011497static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011498parse_args_finds_unicode(const char * function_name, PyObject *args,
11499 PyObject **substring,
11500 Py_ssize_t *start, Py_ssize_t *end)
11501{
11502 if(stringlib_parse_args_finds(function_name, args, substring,
11503 start, end)) {
11504 if (ensure_unicode(*substring) < 0)
11505 return 0;
11506 return 1;
11507 }
11508 return 0;
11509}
11510
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011511PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011512 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011513\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011514Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011515string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011516interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011517
11518static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011519unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011521 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011522 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011523 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011524 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011525 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011526 void *buf1, *buf2;
11527 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011528
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011529 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011530 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011531
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011532 kind1 = PyUnicode_KIND(self);
11533 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011534 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011535 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011536
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011537 len1 = PyUnicode_GET_LENGTH(self);
11538 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011539 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011540 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011541 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011542
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011543 buf1 = PyUnicode_DATA(self);
11544 buf2 = PyUnicode_DATA(substring);
11545 if (kind2 != kind1) {
11546 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011547 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011548 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011549 }
11550 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011551 case PyUnicode_1BYTE_KIND:
11552 iresult = ucs1lib_count(
11553 ((Py_UCS1*)buf1) + start, end - start,
11554 buf2, len2, PY_SSIZE_T_MAX
11555 );
11556 break;
11557 case PyUnicode_2BYTE_KIND:
11558 iresult = ucs2lib_count(
11559 ((Py_UCS2*)buf1) + start, end - start,
11560 buf2, len2, PY_SSIZE_T_MAX
11561 );
11562 break;
11563 case PyUnicode_4BYTE_KIND:
11564 iresult = ucs4lib_count(
11565 ((Py_UCS4*)buf1) + start, end - start,
11566 buf2, len2, PY_SSIZE_T_MAX
11567 );
11568 break;
11569 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011570 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 }
11572
11573 result = PyLong_FromSsize_t(iresult);
11574
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011575 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 return result;
11579}
11580
INADA Naoki3ae20562017-01-16 20:41:20 +090011581/*[clinic input]
11582str.encode as unicode_encode
11583
11584 encoding: str(c_default="NULL") = 'utf-8'
11585 The encoding in which to encode the string.
11586 errors: str(c_default="NULL") = 'strict'
11587 The error handling scheme to use for encoding errors.
11588 The default is 'strict' meaning that encoding errors raise a
11589 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11590 'xmlcharrefreplace' as well as any other name registered with
11591 codecs.register_error that can handle UnicodeEncodeErrors.
11592
11593Encode the string using the codec registered for encoding.
11594[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011595
11596static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011597unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011598/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011599{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011600 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011601}
11602
INADA Naoki3ae20562017-01-16 20:41:20 +090011603/*[clinic input]
11604str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011605
INADA Naoki3ae20562017-01-16 20:41:20 +090011606 tabsize: int = 8
11607
11608Return a copy where all tab characters are expanded using spaces.
11609
11610If tabsize is not given, a tab size of 8 characters is assumed.
11611[clinic start generated code]*/
11612
11613static PyObject *
11614unicode_expandtabs_impl(PyObject *self, int tabsize)
11615/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011617 Py_ssize_t i, j, line_pos, src_len, incr;
11618 Py_UCS4 ch;
11619 PyObject *u;
11620 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011621 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011622 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Antoine Pitrou22425222011-10-04 19:10:51 +020011624 if (PyUnicode_READY(self) == -1)
11625 return NULL;
11626
Thomas Wouters7e474022000-07-16 12:04:32 +000011627 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011628 src_len = PyUnicode_GET_LENGTH(self);
11629 i = j = line_pos = 0;
11630 kind = PyUnicode_KIND(self);
11631 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011632 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011633 for (; i < src_len; i++) {
11634 ch = PyUnicode_READ(kind, src_data, i);
11635 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011636 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011637 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011638 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011639 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011640 goto overflow;
11641 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011642 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011643 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011644 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011645 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011646 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011647 goto overflow;
11648 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011649 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011650 if (ch == '\n' || ch == '\r')
11651 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011653 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011654 if (!found)
11655 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011656
Guido van Rossumd57fd912000-03-10 22:53:23 +000011657 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011658 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011659 if (!u)
11660 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011661 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662
Antoine Pitroue71d5742011-10-04 15:55:09 +020011663 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664
Antoine Pitroue71d5742011-10-04 15:55:09 +020011665 for (; i < src_len; i++) {
11666 ch = PyUnicode_READ(kind, src_data, i);
11667 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011668 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011669 incr = tabsize - (line_pos % tabsize);
11670 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011671 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011672 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011673 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011674 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011675 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011676 line_pos++;
11677 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011678 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011679 if (ch == '\n' || ch == '\r')
11680 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011682 }
11683 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011684 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011685
Antoine Pitroue71d5742011-10-04 15:55:09 +020011686 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011687 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11688 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689}
11690
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011691PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011692 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011693\n\
11694Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011695such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696arguments start and end are interpreted as in slice notation.\n\
11697\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011698Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
11700static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011703 /* initialize variables to prevent gcc warning */
11704 PyObject *substring = NULL;
11705 Py_ssize_t start = 0;
11706 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011707 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011709 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011711
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011712 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011713 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011714
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011715 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011716
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011717 if (result == -2)
11718 return NULL;
11719
Christian Heimes217cfd12007-12-02 14:31:20 +000011720 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011721}
11722
11723static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011724unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011725{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011726 void *data;
11727 enum PyUnicode_Kind kind;
11728 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011729
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011730 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011731 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011732 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011733 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011734 if (PyUnicode_READY(self) == -1) {
11735 return NULL;
11736 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011737 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11738 PyErr_SetString(PyExc_IndexError, "string index out of range");
11739 return NULL;
11740 }
11741 kind = PyUnicode_KIND(self);
11742 data = PyUnicode_DATA(self);
11743 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011744 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011745}
11746
Guido van Rossumc2504932007-09-18 19:42:40 +000011747/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011748 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011749static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011750unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011752 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011753
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011754#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011755 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011756#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (_PyUnicode_HASH(self) != -1)
11758 return _PyUnicode_HASH(self);
11759 if (PyUnicode_READY(self) == -1)
11760 return -1;
animalizea1d14252019-01-02 20:16:06 +080011761
Christian Heimes985ecdc2013-11-20 11:46:18 +010011762 x = _Py_HashBytes(PyUnicode_DATA(self),
11763 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011765 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011766}
11767
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011768PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011769 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770\n\
oldkaa0735f2018-02-02 16:52:55 +080011771Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011772such that sub is contained within S[start:end]. Optional\n\
11773arguments start and end are interpreted as in slice notation.\n\
11774\n\
11775Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776
11777static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011778unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011779{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011780 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011781 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011782 PyObject *substring = NULL;
11783 Py_ssize_t start = 0;
11784 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011785
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011786 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011787 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011789 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011790 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011791
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011792 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (result == -2)
11795 return NULL;
11796
Guido van Rossumd57fd912000-03-10 22:53:23 +000011797 if (result < 0) {
11798 PyErr_SetString(PyExc_ValueError, "substring not found");
11799 return NULL;
11800 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011801
Christian Heimes217cfd12007-12-02 14:31:20 +000011802 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011803}
11804
INADA Naoki3ae20562017-01-16 20:41:20 +090011805/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011806str.isascii as unicode_isascii
11807
11808Return True if all characters in the string are ASCII, False otherwise.
11809
11810ASCII characters have code points in the range U+0000-U+007F.
11811Empty string is ASCII too.
11812[clinic start generated code]*/
11813
11814static PyObject *
11815unicode_isascii_impl(PyObject *self)
11816/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11817{
11818 if (PyUnicode_READY(self) == -1) {
11819 return NULL;
11820 }
11821 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11822}
11823
11824/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011825str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826
INADA Naoki3ae20562017-01-16 20:41:20 +090011827Return True if the string is a lowercase string, False otherwise.
11828
11829A string is lowercase if all cased characters in the string are lowercase and
11830there is at least one cased character in the string.
11831[clinic start generated code]*/
11832
11833static PyObject *
11834unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011835/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011837 Py_ssize_t i, length;
11838 int kind;
11839 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011840 int cased;
11841
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 if (PyUnicode_READY(self) == -1)
11843 return NULL;
11844 length = PyUnicode_GET_LENGTH(self);
11845 kind = PyUnicode_KIND(self);
11846 data = PyUnicode_DATA(self);
11847
Guido van Rossumd57fd912000-03-10 22:53:23 +000011848 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011849 if (length == 1)
11850 return PyBool_FromLong(
11851 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011852
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011853 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011855 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011856
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 for (i = 0; i < length; i++) {
11859 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011860
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011862 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011863 else if (!cased && Py_UNICODE_ISLOWER(ch))
11864 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011866 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011867}
11868
INADA Naoki3ae20562017-01-16 20:41:20 +090011869/*[clinic input]
11870str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011871
INADA Naoki3ae20562017-01-16 20:41:20 +090011872Return True if the string is an uppercase string, False otherwise.
11873
11874A string is uppercase if all cased characters in the string are uppercase and
11875there is at least one cased character in the string.
11876[clinic start generated code]*/
11877
11878static PyObject *
11879unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011880/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011882 Py_ssize_t i, length;
11883 int kind;
11884 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885 int cased;
11886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011887 if (PyUnicode_READY(self) == -1)
11888 return NULL;
11889 length = PyUnicode_GET_LENGTH(self);
11890 kind = PyUnicode_KIND(self);
11891 data = PyUnicode_DATA(self);
11892
Guido van Rossumd57fd912000-03-10 22:53:23 +000011893 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011894 if (length == 1)
11895 return PyBool_FromLong(
11896 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011898 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011899 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011900 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011901
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 for (i = 0; i < length; i++) {
11904 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011905
Benjamin Peterson29060642009-01-31 22:14:21 +000011906 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011907 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011908 else if (!cased && Py_UNICODE_ISUPPER(ch))
11909 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011911 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912}
11913
INADA Naoki3ae20562017-01-16 20:41:20 +090011914/*[clinic input]
11915str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916
INADA Naoki3ae20562017-01-16 20:41:20 +090011917Return True if the string is a title-cased string, False otherwise.
11918
11919In a title-cased string, upper- and title-case characters may only
11920follow uncased characters and lowercase characters only cased ones.
11921[clinic start generated code]*/
11922
11923static PyObject *
11924unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011925/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011926{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011927 Py_ssize_t i, length;
11928 int kind;
11929 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011930 int cased, previous_is_cased;
11931
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 if (PyUnicode_READY(self) == -1)
11933 return NULL;
11934 length = PyUnicode_GET_LENGTH(self);
11935 kind = PyUnicode_KIND(self);
11936 data = PyUnicode_DATA(self);
11937
Guido van Rossumd57fd912000-03-10 22:53:23 +000011938 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011939 if (length == 1) {
11940 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11941 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11942 (Py_UNICODE_ISUPPER(ch) != 0));
11943 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011944
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011945 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011947 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011948
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 cased = 0;
11950 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 for (i = 0; i < length; i++) {
11952 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011953
Benjamin Peterson29060642009-01-31 22:14:21 +000011954 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11955 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011956 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011957 previous_is_cased = 1;
11958 cased = 1;
11959 }
11960 else if (Py_UNICODE_ISLOWER(ch)) {
11961 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011962 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011963 previous_is_cased = 1;
11964 cased = 1;
11965 }
11966 else
11967 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011968 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011969 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011970}
11971
INADA Naoki3ae20562017-01-16 20:41:20 +090011972/*[clinic input]
11973str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011974
INADA Naoki3ae20562017-01-16 20:41:20 +090011975Return True if the string is a whitespace string, False otherwise.
11976
11977A string is whitespace if all characters in the string are whitespace and there
11978is at least one character in the string.
11979[clinic start generated code]*/
11980
11981static PyObject *
11982unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011983/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 Py_ssize_t i, length;
11986 int kind;
11987 void *data;
11988
11989 if (PyUnicode_READY(self) == -1)
11990 return NULL;
11991 length = PyUnicode_GET_LENGTH(self);
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011994
Guido van Rossumd57fd912000-03-10 22:53:23 +000011995 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 if (length == 1)
11997 return PyBool_FromLong(
11998 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011999
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012000 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012001 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012002 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012003
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012004 for (i = 0; i < length; i++) {
12005 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012006 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012007 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012009 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010}
12011
INADA Naoki3ae20562017-01-16 20:41:20 +090012012/*[clinic input]
12013str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012014
INADA Naoki3ae20562017-01-16 20:41:20 +090012015Return True if the string is an alphabetic string, False otherwise.
12016
12017A string is alphabetic if all characters in the string are alphabetic and there
12018is at least one character in the string.
12019[clinic start generated code]*/
12020
12021static PyObject *
12022unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012023/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012024{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012025 Py_ssize_t i, length;
12026 int kind;
12027 void *data;
12028
12029 if (PyUnicode_READY(self) == -1)
12030 return NULL;
12031 length = PyUnicode_GET_LENGTH(self);
12032 kind = PyUnicode_KIND(self);
12033 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012034
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012035 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 if (length == 1)
12037 return PyBool_FromLong(
12038 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012039
12040 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012041 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012044 for (i = 0; i < length; i++) {
12045 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012046 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012047 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012048 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012049}
12050
INADA Naoki3ae20562017-01-16 20:41:20 +090012051/*[clinic input]
12052str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012053
INADA Naoki3ae20562017-01-16 20:41:20 +090012054Return True if the string is an alpha-numeric string, False otherwise.
12055
12056A string is alpha-numeric if all characters in the string are alpha-numeric and
12057there is at least one character in the string.
12058[clinic start generated code]*/
12059
12060static PyObject *
12061unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012062/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 int kind;
12065 void *data;
12066 Py_ssize_t len, i;
12067
12068 if (PyUnicode_READY(self) == -1)
12069 return NULL;
12070
12071 kind = PyUnicode_KIND(self);
12072 data = PyUnicode_DATA(self);
12073 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012074
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012075 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 if (len == 1) {
12077 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12078 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12079 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012080
12081 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012082 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012083 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012085 for (i = 0; i < len; i++) {
12086 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012087 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012088 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012089 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012090 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012091}
12092
INADA Naoki3ae20562017-01-16 20:41:20 +090012093/*[clinic input]
12094str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
INADA Naoki3ae20562017-01-16 20:41:20 +090012096Return True if the string is a decimal string, False otherwise.
12097
12098A string is a decimal string if all characters in the string are decimal and
12099there is at least one character in the string.
12100[clinic start generated code]*/
12101
12102static PyObject *
12103unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012104/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 Py_ssize_t i, length;
12107 int kind;
12108 void *data;
12109
12110 if (PyUnicode_READY(self) == -1)
12111 return NULL;
12112 length = PyUnicode_GET_LENGTH(self);
12113 kind = PyUnicode_KIND(self);
12114 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (length == 1)
12118 return PyBool_FromLong(
12119 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012121 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012123 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 for (i = 0; i < length; i++) {
12126 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012127 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012129 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130}
12131
INADA Naoki3ae20562017-01-16 20:41:20 +090012132/*[clinic input]
12133str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
INADA Naoki3ae20562017-01-16 20:41:20 +090012135Return True if the string is a digit string, False otherwise.
12136
12137A string is a digit string if all characters in the string are digits and there
12138is at least one character in the string.
12139[clinic start generated code]*/
12140
12141static PyObject *
12142unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012143/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 Py_ssize_t i, length;
12146 int kind;
12147 void *data;
12148
12149 if (PyUnicode_READY(self) == -1)
12150 return NULL;
12151 length = PyUnicode_GET_LENGTH(self);
12152 kind = PyUnicode_KIND(self);
12153 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (length == 1) {
12157 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12158 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12159 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012160
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012161 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012162 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012163 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012164
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012165 for (i = 0; i < length; i++) {
12166 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012167 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012168 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012169 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012170}
12171
INADA Naoki3ae20562017-01-16 20:41:20 +090012172/*[clinic input]
12173str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012174
INADA Naoki3ae20562017-01-16 20:41:20 +090012175Return True if the string is a numeric string, False otherwise.
12176
12177A string is numeric if all characters in the string are numeric and there is at
12178least one character in the string.
12179[clinic start generated code]*/
12180
12181static PyObject *
12182unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012183/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012184{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012185 Py_ssize_t i, length;
12186 int kind;
12187 void *data;
12188
12189 if (PyUnicode_READY(self) == -1)
12190 return NULL;
12191 length = PyUnicode_GET_LENGTH(self);
12192 kind = PyUnicode_KIND(self);
12193 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194
Guido van Rossumd57fd912000-03-10 22:53:23 +000012195 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012196 if (length == 1)
12197 return PyBool_FromLong(
12198 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012202 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 for (i = 0; i < length; i++) {
12205 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012206 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012208 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209}
12210
Martin v. Löwis47383402007-08-15 07:32:56 +000012211int
12212PyUnicode_IsIdentifier(PyObject *self)
12213{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012214 int kind;
12215 void *data;
12216 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012217 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012218
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012219 if (PyUnicode_READY(self) == -1) {
12220 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012221 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012222 }
12223
12224 /* Special case for empty strings */
12225 if (PyUnicode_GET_LENGTH(self) == 0)
12226 return 0;
12227 kind = PyUnicode_KIND(self);
12228 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012229
12230 /* PEP 3131 says that the first character must be in
12231 XID_Start and subsequent characters in XID_Continue,
12232 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012233 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012234 letters, digits, underscore). However, given the current
12235 definition of XID_Start and XID_Continue, it is sufficient
12236 to check just for these, except that _ must be allowed
12237 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012238 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012239 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012240 return 0;
12241
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012242 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012244 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012245 return 1;
12246}
12247
INADA Naoki3ae20562017-01-16 20:41:20 +090012248/*[clinic input]
12249str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012250
INADA Naoki3ae20562017-01-16 20:41:20 +090012251Return True if the string is a valid Python identifier, False otherwise.
12252
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012253Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012254such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012255[clinic start generated code]*/
12256
12257static PyObject *
12258unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012259/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012260{
12261 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12262}
12263
INADA Naoki3ae20562017-01-16 20:41:20 +090012264/*[clinic input]
12265str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012266
INADA Naoki3ae20562017-01-16 20:41:20 +090012267Return True if the string is printable, False otherwise.
12268
12269A string is printable if all of its characters are considered printable in
12270repr() or if it is empty.
12271[clinic start generated code]*/
12272
12273static PyObject *
12274unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012275/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012276{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012277 Py_ssize_t i, length;
12278 int kind;
12279 void *data;
12280
12281 if (PyUnicode_READY(self) == -1)
12282 return NULL;
12283 length = PyUnicode_GET_LENGTH(self);
12284 kind = PyUnicode_KIND(self);
12285 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012286
12287 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012288 if (length == 1)
12289 return PyBool_FromLong(
12290 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012291
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012292 for (i = 0; i < length; i++) {
12293 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012294 Py_RETURN_FALSE;
12295 }
12296 }
12297 Py_RETURN_TRUE;
12298}
12299
INADA Naoki3ae20562017-01-16 20:41:20 +090012300/*[clinic input]
12301str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012302
INADA Naoki3ae20562017-01-16 20:41:20 +090012303 iterable: object
12304 /
12305
12306Concatenate any number of strings.
12307
Martin Panter91a88662017-01-24 00:30:06 +000012308The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012309The result is returned as a new string.
12310
12311Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12312[clinic start generated code]*/
12313
12314static PyObject *
12315unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012316/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012317{
INADA Naoki3ae20562017-01-16 20:41:20 +090012318 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012319}
12320
Martin v. Löwis18e16552006-02-15 17:27:45 +000012321static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012322unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012323{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012324 if (PyUnicode_READY(self) == -1)
12325 return -1;
12326 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012327}
12328
INADA Naoki3ae20562017-01-16 20:41:20 +090012329/*[clinic input]
12330str.ljust as unicode_ljust
12331
12332 width: Py_ssize_t
12333 fillchar: Py_UCS4 = ' '
12334 /
12335
12336Return a left-justified string of length width.
12337
12338Padding is done using the specified fill character (default is a space).
12339[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012340
12341static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012342unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12343/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012344{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012345 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012347
Victor Stinnerc4b49542011-12-11 22:44:26 +010012348 if (PyUnicode_GET_LENGTH(self) >= width)
12349 return unicode_result_unchanged(self);
12350
12351 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012352}
12353
INADA Naoki3ae20562017-01-16 20:41:20 +090012354/*[clinic input]
12355str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012356
INADA Naoki3ae20562017-01-16 20:41:20 +090012357Return a copy of the string converted to lowercase.
12358[clinic start generated code]*/
12359
12360static PyObject *
12361unicode_lower_impl(PyObject *self)
12362/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012363{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012364 if (PyUnicode_READY(self) == -1)
12365 return NULL;
12366 if (PyUnicode_IS_ASCII(self))
12367 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012368 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012369}
12370
/* Strip modes: which end(s) of the string get stripped. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names used in error messages, indexed by the strip modes above */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

/* Name of the str method implementing strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012379
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012380/* externally visible for str.strip(unicode) */
12381PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012382_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 void *data;
12385 int kind;
12386 Py_ssize_t i, j, len;
12387 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012388 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012389
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012390 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12391 return NULL;
12392
12393 kind = PyUnicode_KIND(self);
12394 data = PyUnicode_DATA(self);
12395 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012396 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12398 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012399 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012400
Benjamin Peterson14339b62009-01-31 16:36:08 +000012401 i = 0;
12402 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012403 while (i < len) {
12404 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12405 if (!BLOOM(sepmask, ch))
12406 break;
12407 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12408 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012409 i++;
12410 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012411 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012412
Benjamin Peterson14339b62009-01-31 16:36:08 +000012413 j = len;
12414 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012415 j--;
12416 while (j >= i) {
12417 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12418 if (!BLOOM(sepmask, ch))
12419 break;
12420 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12421 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012422 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012423 }
12424
Benjamin Peterson29060642009-01-31 22:14:21 +000012425 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012426 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012427
Victor Stinner7931d9a2011-11-04 00:22:48 +010012428 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012429}
12430
12431PyObject*
12432PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12433{
12434 unsigned char *data;
12435 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012436 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437
Victor Stinnerde636f32011-10-01 03:55:54 +020012438 if (PyUnicode_READY(self) == -1)
12439 return NULL;
12440
Victor Stinner684d5fd2012-05-03 02:32:34 +020012441 length = PyUnicode_GET_LENGTH(self);
12442 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012443
Victor Stinner684d5fd2012-05-03 02:32:34 +020012444 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012445 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012446
Victor Stinnerde636f32011-10-01 03:55:54 +020012447 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012448 PyErr_SetString(PyExc_IndexError, "string index out of range");
12449 return NULL;
12450 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012451 if (start >= length || end < start)
12452 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012453
Victor Stinner684d5fd2012-05-03 02:32:34 +020012454 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012455 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012456 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012457 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012458 }
12459 else {
12460 kind = PyUnicode_KIND(self);
12461 data = PyUnicode_1BYTE_DATA(self);
12462 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012463 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012464 length);
12465 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012466}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012467
12468static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012469do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012470{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012471 Py_ssize_t len, i, j;
12472
12473 if (PyUnicode_READY(self) == -1)
12474 return NULL;
12475
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012476 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477
Victor Stinnercc7af722013-04-09 22:39:24 +020012478 if (PyUnicode_IS_ASCII(self)) {
12479 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12480
12481 i = 0;
12482 if (striptype != RIGHTSTRIP) {
12483 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012484 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012485 if (!_Py_ascii_whitespace[ch])
12486 break;
12487 i++;
12488 }
12489 }
12490
12491 j = len;
12492 if (striptype != LEFTSTRIP) {
12493 j--;
12494 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012495 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012496 if (!_Py_ascii_whitespace[ch])
12497 break;
12498 j--;
12499 }
12500 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012501 }
12502 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012503 else {
12504 int kind = PyUnicode_KIND(self);
12505 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012506
Victor Stinnercc7af722013-04-09 22:39:24 +020012507 i = 0;
12508 if (striptype != RIGHTSTRIP) {
12509 while (i < len) {
12510 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12511 if (!Py_UNICODE_ISSPACE(ch))
12512 break;
12513 i++;
12514 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012515 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012516
12517 j = len;
12518 if (striptype != LEFTSTRIP) {
12519 j--;
12520 while (j >= i) {
12521 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12522 if (!Py_UNICODE_ISSPACE(ch))
12523 break;
12524 j--;
12525 }
12526 j++;
12527 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012528 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012529
Victor Stinner7931d9a2011-11-04 00:22:48 +010012530 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012531}
12532
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012533
12534static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012535do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012536{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012537 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012538 if (PyUnicode_Check(sep))
12539 return _PyUnicode_XStrip(self, striptype, sep);
12540 else {
12541 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012542 "%s arg must be None or str",
12543 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012544 return NULL;
12545 }
12546 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012547
Benjamin Peterson14339b62009-01-31 16:36:08 +000012548 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012549}
12550
12551
INADA Naoki3ae20562017-01-16 20:41:20 +090012552/*[clinic input]
12553str.strip as unicode_strip
12554
12555 chars: object = None
12556 /
12557
Zachary Ware09895c22019-10-09 16:09:00 -050012558Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012559
12560If chars is given and not None, remove characters in chars instead.
12561[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012562
12563static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012564unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012565/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012566{
INADA Naoki3ae20562017-01-16 20:41:20 +090012567 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012568}
12569
12570
INADA Naoki3ae20562017-01-16 20:41:20 +090012571/*[clinic input]
12572str.lstrip as unicode_lstrip
12573
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012574 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012575 /
12576
12577Return a copy of the string with leading whitespace removed.
12578
12579If chars is given and not None, remove characters in chars instead.
12580[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012581
12582static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012583unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012584/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012585{
INADA Naoki3ae20562017-01-16 20:41:20 +090012586 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012587}
12588
12589
INADA Naoki3ae20562017-01-16 20:41:20 +090012590/*[clinic input]
12591str.rstrip as unicode_rstrip
12592
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012593 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012594 /
12595
12596Return a copy of the string with trailing whitespace removed.
12597
12598If chars is given and not None, remove characters in chars instead.
12599[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012600
12601static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012602unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012603/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012604{
INADA Naoki3ae20562017-01-16 20:41:20 +090012605 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012606}
12607
12608
Guido van Rossumd57fd912000-03-10 22:53:23 +000012609static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012610unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012611{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012612 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012613 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012614
Serhiy Storchaka05997252013-01-26 12:14:02 +020012615 if (len < 1)
12616 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012617
Victor Stinnerc4b49542011-12-11 22:44:26 +010012618 /* no repeat, return original string */
12619 if (len == 1)
12620 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012621
Benjamin Petersonbac79492012-01-14 13:34:47 -050012622 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012623 return NULL;
12624
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012625 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012626 PyErr_SetString(PyExc_OverflowError,
12627 "repeated string is too long");
12628 return NULL;
12629 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012631
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012632 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012633 if (!u)
12634 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012635 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 if (PyUnicode_GET_LENGTH(str) == 1) {
12638 const int kind = PyUnicode_KIND(str);
12639 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012640 if (kind == PyUnicode_1BYTE_KIND) {
12641 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012642 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012643 }
12644 else if (kind == PyUnicode_2BYTE_KIND) {
12645 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012646 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012647 ucs2[n] = fill_char;
12648 } else {
12649 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12650 assert(kind == PyUnicode_4BYTE_KIND);
12651 for (n = 0; n < len; ++n)
12652 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012653 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 }
12655 else {
12656 /* number of characters copied this far */
12657 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012658 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012660 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012662 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012664 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012665 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667 }
12668
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012669 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012670 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012671}
12672
Alexander Belopolsky40018472011-02-26 01:02:56 +000012673PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012674PyUnicode_Replace(PyObject *str,
12675 PyObject *substr,
12676 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012677 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012678{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012679 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12680 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012681 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012682 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012683}
12684
INADA Naoki3ae20562017-01-16 20:41:20 +090012685/*[clinic input]
12686str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012687
INADA Naoki3ae20562017-01-16 20:41:20 +090012688 old: unicode
12689 new: unicode
12690 count: Py_ssize_t = -1
12691 Maximum number of occurrences to replace.
12692 -1 (the default value) means replace all occurrences.
12693 /
12694
12695Return a copy with all occurrences of substring old replaced by new.
12696
12697If the optional argument count is given, only the first count occurrences are
12698replaced.
12699[clinic start generated code]*/
12700
12701static PyObject *
12702unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12703 Py_ssize_t count)
12704/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012705{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012706 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012707 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012708 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012709}
12710
Alexander Belopolsky40018472011-02-26 01:02:56 +000012711static PyObject *
12712unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012713{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012714 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715 Py_ssize_t isize;
12716 Py_ssize_t osize, squote, dquote, i, o;
12717 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012718 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012720
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012721 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012722 return NULL;
12723
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012724 isize = PyUnicode_GET_LENGTH(unicode);
12725 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012727 /* Compute length of output, quote characters, and
12728 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012729 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012730 max = 127;
12731 squote = dquote = 0;
12732 ikind = PyUnicode_KIND(unicode);
12733 for (i = 0; i < isize; i++) {
12734 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012735 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012736 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012737 case '\'': squote++; break;
12738 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012739 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012740 incr = 2;
12741 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012742 default:
12743 /* Fast-path ASCII */
12744 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012745 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012746 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012747 ;
12748 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012749 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012750 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012751 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012752 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012753 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012754 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012755 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012756 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012757 if (osize > PY_SSIZE_T_MAX - incr) {
12758 PyErr_SetString(PyExc_OverflowError,
12759 "string is too long to generate repr");
12760 return NULL;
12761 }
12762 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012763 }
12764
12765 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012766 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012767 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012768 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012769 if (dquote)
12770 /* Both squote and dquote present. Use squote,
12771 and escape them */
12772 osize += squote;
12773 else
12774 quote = '"';
12775 }
Victor Stinner55c08782013-04-14 18:45:39 +020012776 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777
12778 repr = PyUnicode_New(osize, max);
12779 if (repr == NULL)
12780 return NULL;
12781 okind = PyUnicode_KIND(repr);
12782 odata = PyUnicode_DATA(repr);
12783
12784 PyUnicode_WRITE(okind, odata, 0, quote);
12785 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012786 if (unchanged) {
12787 _PyUnicode_FastCopyCharacters(repr, 1,
12788 unicode, 0,
12789 isize);
12790 }
12791 else {
12792 for (i = 0, o = 1; i < isize; i++) {
12793 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012794
Victor Stinner55c08782013-04-14 18:45:39 +020012795 /* Escape quotes and backslashes */
12796 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012797 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012799 continue;
12800 }
12801
12802 /* Map special whitespace to '\t', \n', '\r' */
12803 if (ch == '\t') {
12804 PyUnicode_WRITE(okind, odata, o++, '\\');
12805 PyUnicode_WRITE(okind, odata, o++, 't');
12806 }
12807 else if (ch == '\n') {
12808 PyUnicode_WRITE(okind, odata, o++, '\\');
12809 PyUnicode_WRITE(okind, odata, o++, 'n');
12810 }
12811 else if (ch == '\r') {
12812 PyUnicode_WRITE(okind, odata, o++, '\\');
12813 PyUnicode_WRITE(okind, odata, o++, 'r');
12814 }
12815
12816 /* Map non-printable US ASCII to '\xhh' */
12817 else if (ch < ' ' || ch == 0x7F) {
12818 PyUnicode_WRITE(okind, odata, o++, '\\');
12819 PyUnicode_WRITE(okind, odata, o++, 'x');
12820 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12821 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12822 }
12823
12824 /* Copy ASCII characters as-is */
12825 else if (ch < 0x7F) {
12826 PyUnicode_WRITE(okind, odata, o++, ch);
12827 }
12828
12829 /* Non-ASCII characters */
12830 else {
12831 /* Map Unicode whitespace and control characters
12832 (categories Z* and C* except ASCII space)
12833 */
12834 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12835 PyUnicode_WRITE(okind, odata, o++, '\\');
12836 /* Map 8-bit characters to '\xhh' */
12837 if (ch <= 0xff) {
12838 PyUnicode_WRITE(okind, odata, o++, 'x');
12839 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12840 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12841 }
12842 /* Map 16-bit characters to '\uxxxx' */
12843 else if (ch <= 0xffff) {
12844 PyUnicode_WRITE(okind, odata, o++, 'u');
12845 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12846 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12847 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12848 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12849 }
12850 /* Map 21-bit characters to '\U00xxxxxx' */
12851 else {
12852 PyUnicode_WRITE(okind, odata, o++, 'U');
12853 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12854 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12855 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12856 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12857 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12858 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12859 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12860 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12861 }
12862 }
12863 /* Copy characters as-is */
12864 else {
12865 PyUnicode_WRITE(okind, odata, o++, ch);
12866 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012867 }
12868 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012869 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012870 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012871 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012872 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012873}
12874
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012875PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012876 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012877\n\
12878Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012879such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880arguments start and end are interpreted as in slice notation.\n\
12881\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012882Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883
12884static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012885unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012887 /* initialize variables to prevent gcc warning */
12888 PyObject *substring = NULL;
12889 Py_ssize_t start = 0;
12890 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012891 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012893 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012896 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012897 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012898
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012899 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012900
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012901 if (result == -2)
12902 return NULL;
12903
Christian Heimes217cfd12007-12-02 14:31:20 +000012904 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012905}
12906
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012907PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012908 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012909\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012910Return the highest index in S where substring sub is found,\n\
12911such that sub is contained within S[start:end]. Optional\n\
12912arguments start and end are interpreted as in slice notation.\n\
12913\n\
12914Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012915
12916static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012917unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012919 /* initialize variables to prevent gcc warning */
12920 PyObject *substring = NULL;
12921 Py_ssize_t start = 0;
12922 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012924
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012925 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012926 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012927
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012929 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012931 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012932
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 if (result == -2)
12934 return NULL;
12935
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936 if (result < 0) {
12937 PyErr_SetString(PyExc_ValueError, "substring not found");
12938 return NULL;
12939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940
Christian Heimes217cfd12007-12-02 14:31:20 +000012941 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012942}
12943
INADA Naoki3ae20562017-01-16 20:41:20 +090012944/*[clinic input]
12945str.rjust as unicode_rjust
12946
12947 width: Py_ssize_t
12948 fillchar: Py_UCS4 = ' '
12949 /
12950
12951Return a right-justified string of length width.
12952
12953Padding is done using the specified fill character (default is a space).
12954[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012955
12956static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012957unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12958/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012959{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012960 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012961 return NULL;
12962
Victor Stinnerc4b49542011-12-11 22:44:26 +010012963 if (PyUnicode_GET_LENGTH(self) >= width)
12964 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012965
Victor Stinnerc4b49542011-12-11 22:44:26 +010012966 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012967}
12968
Alexander Belopolsky40018472011-02-26 01:02:56 +000012969PyObject *
12970PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012972 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012973 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012974
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012975 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012976}
12977
INADA Naoki3ae20562017-01-16 20:41:20 +090012978/*[clinic input]
12979str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980
INADA Naoki3ae20562017-01-16 20:41:20 +090012981 sep: object = None
12982 The delimiter according to which to split the string.
12983 None (the default value) means split according to any whitespace,
12984 and discard empty strings from the result.
12985 maxsplit: Py_ssize_t = -1
12986 Maximum number of splits to do.
12987 -1 (the default value) means no limit.
12988
12989Return a list of the words in the string, using sep as the delimiter string.
12990[clinic start generated code]*/
12991
12992static PyObject *
12993unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12994/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995{
INADA Naoki3ae20562017-01-16 20:41:20 +090012996 if (sep == Py_None)
12997 return split(self, NULL, maxsplit);
12998 if (PyUnicode_Check(sep))
12999 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013000
Victor Stinner998b8062018-09-12 00:23:25 +020013001 PyErr_Format(PyExc_TypeError,
13002 "must be str or None, not %.100s",
13003 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013004 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013005}
13006
Thomas Wouters477c8d52006-05-27 19:21:47 +000013007PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013008PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013009{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013010 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013011 int kind1, kind2;
13012 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013014
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013015 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013016 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013017
Victor Stinner14f8f022011-10-05 20:58:25 +020013018 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013020 len1 = PyUnicode_GET_LENGTH(str_obj);
13021 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013022 if (kind1 < kind2 || len1 < len2) {
13023 _Py_INCREF_UNICODE_EMPTY();
13024 if (!unicode_empty)
13025 out = NULL;
13026 else {
13027 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13028 Py_DECREF(unicode_empty);
13029 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013030 return out;
13031 }
13032 buf1 = PyUnicode_DATA(str_obj);
13033 buf2 = PyUnicode_DATA(sep_obj);
13034 if (kind2 != kind1) {
13035 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13036 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013037 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013038 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013039
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013040 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013041 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013042 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13043 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13044 else
13045 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013046 break;
13047 case PyUnicode_2BYTE_KIND:
13048 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13049 break;
13050 case PyUnicode_4BYTE_KIND:
13051 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13052 break;
13053 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013054 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013055 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013057 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013058 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013059
13060 return out;
13061}
13062
13063
13064PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013065PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013066{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013068 int kind1, kind2;
13069 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013070 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013071
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013072 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013073 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013074
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013075 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013076 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013077 len1 = PyUnicode_GET_LENGTH(str_obj);
13078 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013079 if (kind1 < kind2 || len1 < len2) {
13080 _Py_INCREF_UNICODE_EMPTY();
13081 if (!unicode_empty)
13082 out = NULL;
13083 else {
13084 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13085 Py_DECREF(unicode_empty);
13086 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013087 return out;
13088 }
13089 buf1 = PyUnicode_DATA(str_obj);
13090 buf2 = PyUnicode_DATA(sep_obj);
13091 if (kind2 != kind1) {
13092 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13093 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013094 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013095 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013096
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013097 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013098 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013099 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13100 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13101 else
13102 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013103 break;
13104 case PyUnicode_2BYTE_KIND:
13105 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13106 break;
13107 case PyUnicode_4BYTE_KIND:
13108 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13109 break;
13110 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013111 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013112 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013113
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013114 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013115 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013116
13117 return out;
13118}
13119
INADA Naoki3ae20562017-01-16 20:41:20 +090013120/*[clinic input]
13121str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013122
INADA Naoki3ae20562017-01-16 20:41:20 +090013123 sep: object
13124 /
13125
13126Partition the string into three parts using the given separator.
13127
13128This will search for the separator in the string. If the separator is found,
13129returns a 3-tuple containing the part before the separator, the separator
13130itself, and the part after it.
13131
13132If the separator is not found, returns a 3-tuple containing the original string
13133and two empty strings.
13134[clinic start generated code]*/
13135
13136static PyObject *
13137unicode_partition(PyObject *self, PyObject *sep)
13138/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013139{
INADA Naoki3ae20562017-01-16 20:41:20 +090013140 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013141}
13142
INADA Naoki3ae20562017-01-16 20:41:20 +090013143/*[clinic input]
13144str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013145
INADA Naoki3ae20562017-01-16 20:41:20 +090013146Partition the string into three parts using the given separator.
13147
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013148This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013149the separator is found, returns a 3-tuple containing the part before the
13150separator, the separator itself, and the part after it.
13151
13152If the separator is not found, returns a 3-tuple containing two empty strings
13153and the original string.
13154[clinic start generated code]*/
13155
13156static PyObject *
13157unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013158/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013159{
INADA Naoki3ae20562017-01-16 20:41:20 +090013160 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013161}
13162
Alexander Belopolsky40018472011-02-26 01:02:56 +000013163PyObject *
13164PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013165{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013166 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013167 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013168
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013169 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013170}
13171
INADA Naoki3ae20562017-01-16 20:41:20 +090013172/*[clinic input]
13173str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013174
INADA Naoki3ae20562017-01-16 20:41:20 +090013175Return a list of the words in the string, using sep as the delimiter string.
13176
13177Splits are done starting at the end of the string and working to the front.
13178[clinic start generated code]*/
13179
13180static PyObject *
13181unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13182/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013183{
INADA Naoki3ae20562017-01-16 20:41:20 +090013184 if (sep == Py_None)
13185 return rsplit(self, NULL, maxsplit);
13186 if (PyUnicode_Check(sep))
13187 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013188
Victor Stinner998b8062018-09-12 00:23:25 +020013189 PyErr_Format(PyExc_TypeError,
13190 "must be str or None, not %.100s",
13191 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013192 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013193}
13194
INADA Naoki3ae20562017-01-16 20:41:20 +090013195/*[clinic input]
13196str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013197
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013198 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013199
13200Return a list of the lines in the string, breaking at line boundaries.
13201
13202Line breaks are not included in the resulting list unless keepends is given and
13203true.
13204[clinic start generated code]*/
13205
13206static PyObject *
13207unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013208/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013209{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013210 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013211}
13212
13213static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013214PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013215{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013216 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013217}
13218
INADA Naoki3ae20562017-01-16 20:41:20 +090013219/*[clinic input]
13220str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013221
INADA Naoki3ae20562017-01-16 20:41:20 +090013222Convert uppercase characters to lowercase and lowercase characters to uppercase.
13223[clinic start generated code]*/
13224
13225static PyObject *
13226unicode_swapcase_impl(PyObject *self)
13227/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013228{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013229 if (PyUnicode_READY(self) == -1)
13230 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013231 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013232}
13233
Larry Hastings61272b72014-01-07 12:41:53 -080013234/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013235
Larry Hastings31826802013-10-19 00:09:25 -070013236@staticmethod
13237str.maketrans as unicode_maketrans
13238
13239 x: object
13240
13241 y: unicode=NULL
13242
13243 z: unicode=NULL
13244
13245 /
13246
13247Return a translation table usable for str.translate().
13248
13249If there is only one argument, it must be a dictionary mapping Unicode
13250ordinals (integers) or characters to Unicode ordinals, strings or None.
13251Character keys will be then converted to ordinals.
13252If there are two arguments, they must be strings of equal length, and
13253in the resulting dictionary, each character in x will be mapped to the
13254character at the same position in y. If there is a third argument, it
13255must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013256[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013257
Larry Hastings31826802013-10-19 00:09:25 -070013258static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013259unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013260/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013261{
Georg Brandlceee0772007-11-27 23:48:05 +000013262 PyObject *new = NULL, *key, *value;
13263 Py_ssize_t i = 0;
13264 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013265
Georg Brandlceee0772007-11-27 23:48:05 +000013266 new = PyDict_New();
13267 if (!new)
13268 return NULL;
13269 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013270 int x_kind, y_kind, z_kind;
13271 void *x_data, *y_data, *z_data;
13272
Georg Brandlceee0772007-11-27 23:48:05 +000013273 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013274 if (!PyUnicode_Check(x)) {
13275 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13276 "be a string if there is a second argument");
13277 goto err;
13278 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013279 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013280 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13281 "arguments must have equal length");
13282 goto err;
13283 }
13284 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013285 x_kind = PyUnicode_KIND(x);
13286 y_kind = PyUnicode_KIND(y);
13287 x_data = PyUnicode_DATA(x);
13288 y_data = PyUnicode_DATA(y);
13289 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13290 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013291 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013292 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013293 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013294 if (!value) {
13295 Py_DECREF(key);
13296 goto err;
13297 }
Georg Brandlceee0772007-11-27 23:48:05 +000013298 res = PyDict_SetItem(new, key, value);
13299 Py_DECREF(key);
13300 Py_DECREF(value);
13301 if (res < 0)
13302 goto err;
13303 }
13304 /* create entries for deleting chars in z */
13305 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013306 z_kind = PyUnicode_KIND(z);
13307 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013308 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013309 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013310 if (!key)
13311 goto err;
13312 res = PyDict_SetItem(new, key, Py_None);
13313 Py_DECREF(key);
13314 if (res < 0)
13315 goto err;
13316 }
13317 }
13318 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013319 int kind;
13320 void *data;
13321
Georg Brandlceee0772007-11-27 23:48:05 +000013322 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013323 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013324 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13325 "to maketrans it must be a dict");
13326 goto err;
13327 }
13328 /* copy entries into the new dict, converting string keys to int keys */
13329 while (PyDict_Next(x, &i, &key, &value)) {
13330 if (PyUnicode_Check(key)) {
13331 /* convert string keys to integer keys */
13332 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013333 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013334 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13335 "table must be of length 1");
13336 goto err;
13337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 kind = PyUnicode_KIND(key);
13339 data = PyUnicode_DATA(key);
13340 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013341 if (!newkey)
13342 goto err;
13343 res = PyDict_SetItem(new, newkey, value);
13344 Py_DECREF(newkey);
13345 if (res < 0)
13346 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013347 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013348 /* just keep integer keys */
13349 if (PyDict_SetItem(new, key, value) < 0)
13350 goto err;
13351 } else {
13352 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13353 "be strings or integers");
13354 goto err;
13355 }
13356 }
13357 }
13358 return new;
13359 err:
13360 Py_DECREF(new);
13361 return NULL;
13362}
13363
INADA Naoki3ae20562017-01-16 20:41:20 +090013364/*[clinic input]
13365str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366
INADA Naoki3ae20562017-01-16 20:41:20 +090013367 table: object
13368 Translation table, which must be a mapping of Unicode ordinals to
13369 Unicode ordinals, strings, or None.
13370 /
13371
13372Replace each character in the string using the given translation table.
13373
13374The table must implement lookup/indexing via __getitem__, for instance a
13375dictionary or list. If this operation raises LookupError, the character is
13376left untouched. Characters mapped to None are deleted.
13377[clinic start generated code]*/
13378
13379static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013380unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013381/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013383 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013384}
13385
INADA Naoki3ae20562017-01-16 20:41:20 +090013386/*[clinic input]
13387str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013388
INADA Naoki3ae20562017-01-16 20:41:20 +090013389Return a copy of the string converted to uppercase.
13390[clinic start generated code]*/
13391
13392static PyObject *
13393unicode_upper_impl(PyObject *self)
13394/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013395{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013396 if (PyUnicode_READY(self) == -1)
13397 return NULL;
13398 if (PyUnicode_IS_ASCII(self))
13399 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013400 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013401}
13402
INADA Naoki3ae20562017-01-16 20:41:20 +090013403/*[clinic input]
13404str.zfill as unicode_zfill
13405
13406 width: Py_ssize_t
13407 /
13408
13409Pad a numeric string with zeros on the left, to fill a field of the given width.
13410
13411The string is never truncated.
13412[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013413
13414static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013415unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013416/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013418 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013419 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013420 int kind;
13421 void *data;
13422 Py_UCS4 chr;
13423
Benjamin Petersonbac79492012-01-14 13:34:47 -050013424 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013425 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426
Victor Stinnerc4b49542011-12-11 22:44:26 +010013427 if (PyUnicode_GET_LENGTH(self) >= width)
13428 return unicode_result_unchanged(self);
13429
13430 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431
13432 u = pad(self, fill, 0, '0');
13433
Walter Dörwald068325e2002-04-15 13:36:47 +000013434 if (u == NULL)
13435 return NULL;
13436
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013437 kind = PyUnicode_KIND(u);
13438 data = PyUnicode_DATA(u);
13439 chr = PyUnicode_READ(kind, data, fill);
13440
13441 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013442 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013443 PyUnicode_WRITE(kind, data, 0, chr);
13444 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445 }
13446
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013447 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013448 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013449}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013450
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13458
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013459PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013460 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013461\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013462Return True if S starts with the specified prefix, False otherwise.\n\
13463With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013464With optional end, stop comparing S at that position.\n\
13465prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466
13467static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013468unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013469 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013470{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013471 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013472 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013473 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013474 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476
Jesus Ceaac451502011-04-20 17:09:23 +020013477 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013479 if (PyTuple_Check(subobj)) {
13480 Py_ssize_t i;
13481 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013482 substring = PyTuple_GET_ITEM(subobj, i);
13483 if (!PyUnicode_Check(substring)) {
13484 PyErr_Format(PyExc_TypeError,
13485 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013486 "not %.100s",
13487 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013488 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013489 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013490 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013491 if (result == -1)
13492 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013493 if (result) {
13494 Py_RETURN_TRUE;
13495 }
13496 }
13497 /* nothing matched */
13498 Py_RETURN_FALSE;
13499 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013500 if (!PyUnicode_Check(subobj)) {
13501 PyErr_Format(PyExc_TypeError,
13502 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013503 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013504 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013505 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013506 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013507 if (result == -1)
13508 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013509 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013510}
13511
13512
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013513PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013514 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013515\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013516Return True if S ends with the specified suffix, False otherwise.\n\
13517With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013518With optional end, stop comparing S at that position.\n\
13519suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013520
13521static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013522unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013523 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013524{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013525 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013526 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013527 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013528 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013529 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013530
Jesus Ceaac451502011-04-20 17:09:23 +020013531 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013532 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013533 if (PyTuple_Check(subobj)) {
13534 Py_ssize_t i;
13535 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013536 substring = PyTuple_GET_ITEM(subobj, i);
13537 if (!PyUnicode_Check(substring)) {
13538 PyErr_Format(PyExc_TypeError,
13539 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013540 "not %.100s",
13541 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013542 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013543 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013544 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013545 if (result == -1)
13546 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013547 if (result) {
13548 Py_RETURN_TRUE;
13549 }
13550 }
13551 Py_RETURN_FALSE;
13552 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013553 if (!PyUnicode_Check(subobj)) {
13554 PyErr_Format(PyExc_TypeError,
13555 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013556 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013558 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013559 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013560 if (result == -1)
13561 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013562 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013563}
13564
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013565static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013566_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013567{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013568 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13569 writer->data = PyUnicode_DATA(writer->buffer);
13570
13571 if (!writer->readonly) {
13572 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013573 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013574 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013575 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013576 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13577 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13578 writer->kind = PyUnicode_WCHAR_KIND;
13579 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13580
Victor Stinner8f674cc2013-04-17 23:02:17 +020013581 /* Copy-on-write mode: set buffer size to 0 so
13582 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13583 * next write. */
13584 writer->size = 0;
13585 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013586}
13587
Victor Stinnerd3f08822012-05-29 12:57:52 +020013588void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013589_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013590{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013591 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013592
13593 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013594 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013595
13596 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13597 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13598 writer->kind = PyUnicode_WCHAR_KIND;
13599 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013600}
13601
Inada Naoki770847a2019-06-24 12:30:24 +090013602// Initialize _PyUnicodeWriter with initial buffer
13603static inline void
13604_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13605{
13606 memset(writer, 0, sizeof(*writer));
13607 writer->buffer = buffer;
13608 _PyUnicodeWriter_Update(writer);
13609 writer->min_length = writer->size;
13610}
13611
Victor Stinnerd3f08822012-05-29 12:57:52 +020013612int
13613_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13614 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013615{
13616 Py_ssize_t newlen;
13617 PyObject *newbuffer;
13618
Victor Stinner2740e462016-09-06 16:58:36 -070013619 assert(maxchar <= MAX_UNICODE);
13620
Victor Stinnerca9381e2015-09-22 00:58:32 +020013621 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013622 assert((maxchar > writer->maxchar && length >= 0)
13623 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013624
Victor Stinner202fdca2012-05-07 12:47:02 +020013625 if (length > PY_SSIZE_T_MAX - writer->pos) {
13626 PyErr_NoMemory();
13627 return -1;
13628 }
13629 newlen = writer->pos + length;
13630
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013631 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013632
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013634 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013635 if (writer->overallocate
13636 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13637 /* overallocate to limit the number of realloc() */
13638 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013639 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013640 if (newlen < writer->min_length)
13641 newlen = writer->min_length;
13642
Victor Stinnerd3f08822012-05-29 12:57:52 +020013643 writer->buffer = PyUnicode_New(newlen, maxchar);
13644 if (writer->buffer == NULL)
13645 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013646 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013647 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013648 if (writer->overallocate
13649 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13650 /* overallocate to limit the number of realloc() */
13651 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013652 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013653 if (newlen < writer->min_length)
13654 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013655
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013656 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013657 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013658 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013659 newbuffer = PyUnicode_New(newlen, maxchar);
13660 if (newbuffer == NULL)
13661 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013662 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13663 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013664 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013665 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013666 }
13667 else {
13668 newbuffer = resize_compact(writer->buffer, newlen);
13669 if (newbuffer == NULL)
13670 return -1;
13671 }
13672 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013673 }
13674 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013675 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013676 newbuffer = PyUnicode_New(writer->size, maxchar);
13677 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013678 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013679 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13680 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013681 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013682 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013683 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013684 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013685
13686#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013687}
13688
Victor Stinnerca9381e2015-09-22 00:58:32 +020013689int
13690_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13691 enum PyUnicode_Kind kind)
13692{
13693 Py_UCS4 maxchar;
13694
13695 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13696 assert(writer->kind < kind);
13697
13698 switch (kind)
13699 {
13700 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13701 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13702 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13703 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013704 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013705 }
13706
13707 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13708}
13709
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013710static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013711_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013712{
Victor Stinner2740e462016-09-06 16:58:36 -070013713 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013714 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13715 return -1;
13716 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13717 writer->pos++;
13718 return 0;
13719}
13720
13721int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013722_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13723{
13724 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13725}
13726
13727int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013728_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13729{
13730 Py_UCS4 maxchar;
13731 Py_ssize_t len;
13732
13733 if (PyUnicode_READY(str) == -1)
13734 return -1;
13735 len = PyUnicode_GET_LENGTH(str);
13736 if (len == 0)
13737 return 0;
13738 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13739 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013740 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013741 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013742 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013743 Py_INCREF(str);
13744 writer->buffer = str;
13745 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013746 writer->pos += len;
13747 return 0;
13748 }
13749 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13750 return -1;
13751 }
13752 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13753 str, 0, len);
13754 writer->pos += len;
13755 return 0;
13756}
13757
Victor Stinnere215d962012-10-06 23:03:36 +020013758int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013759_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13760 Py_ssize_t start, Py_ssize_t end)
13761{
13762 Py_UCS4 maxchar;
13763 Py_ssize_t len;
13764
13765 if (PyUnicode_READY(str) == -1)
13766 return -1;
13767
13768 assert(0 <= start);
13769 assert(end <= PyUnicode_GET_LENGTH(str));
13770 assert(start <= end);
13771
13772 if (end == 0)
13773 return 0;
13774
13775 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13776 return _PyUnicodeWriter_WriteStr(writer, str);
13777
13778 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13779 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13780 else
13781 maxchar = writer->maxchar;
13782 len = end - start;
13783
13784 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13785 return -1;
13786
13787 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13788 str, start, len);
13789 writer->pos += len;
13790 return 0;
13791}
13792
13793int
Victor Stinner4a587072013-11-19 12:54:53 +010013794_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13795 const char *ascii, Py_ssize_t len)
13796{
13797 if (len == -1)
13798 len = strlen(ascii);
13799
13800 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13801
13802 if (writer->buffer == NULL && !writer->overallocate) {
13803 PyObject *str;
13804
13805 str = _PyUnicode_FromASCII(ascii, len);
13806 if (str == NULL)
13807 return -1;
13808
13809 writer->readonly = 1;
13810 writer->buffer = str;
13811 _PyUnicodeWriter_Update(writer);
13812 writer->pos += len;
13813 return 0;
13814 }
13815
13816 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13817 return -1;
13818
13819 switch (writer->kind)
13820 {
13821 case PyUnicode_1BYTE_KIND:
13822 {
13823 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13824 Py_UCS1 *data = writer->data;
13825
Christian Heimesf051e432016-09-13 20:22:02 +020013826 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013827 break;
13828 }
13829 case PyUnicode_2BYTE_KIND:
13830 {
13831 _PyUnicode_CONVERT_BYTES(
13832 Py_UCS1, Py_UCS2,
13833 ascii, ascii + len,
13834 (Py_UCS2 *)writer->data + writer->pos);
13835 break;
13836 }
13837 case PyUnicode_4BYTE_KIND:
13838 {
13839 _PyUnicode_CONVERT_BYTES(
13840 Py_UCS1, Py_UCS4,
13841 ascii, ascii + len,
13842 (Py_UCS4 *)writer->data + writer->pos);
13843 break;
13844 }
13845 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013846 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013847 }
13848
13849 writer->pos += len;
13850 return 0;
13851}
13852
13853int
13854_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13855 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013856{
13857 Py_UCS4 maxchar;
13858
13859 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13860 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13861 return -1;
13862 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13863 writer->pos += len;
13864 return 0;
13865}
13866
Victor Stinnerd3f08822012-05-29 12:57:52 +020013867PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013868_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013869{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013870 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013871
Victor Stinnerd3f08822012-05-29 12:57:52 +020013872 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013873 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013874 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013875 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013876
13877 str = writer->buffer;
13878 writer->buffer = NULL;
13879
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013880 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013881 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13882 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013883 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013884
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013885 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13886 PyObject *str2;
13887 str2 = resize_compact(str, writer->pos);
13888 if (str2 == NULL) {
13889 Py_DECREF(str);
13890 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013891 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013892 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013893 }
13894
Victor Stinner15a0bd32013-07-08 22:29:55 +020013895 assert(_PyUnicode_CheckConsistency(str, 1));
13896 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013897}
13898
Victor Stinnerd3f08822012-05-29 12:57:52 +020013899void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013900_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013901{
13902 Py_CLEAR(writer->buffer);
13903}
13904
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013905#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013906
13907PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013908 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013909\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013910Return a formatted version of S, using substitutions from args and kwargs.\n\
13911The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013912
Eric Smith27bbca62010-11-04 17:06:58 +000013913PyDoc_STRVAR(format_map__doc__,
13914 "S.format_map(mapping) -> str\n\
13915\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013916Return a formatted version of S, using substitutions from mapping.\n\
13917The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013918
INADA Naoki3ae20562017-01-16 20:41:20 +090013919/*[clinic input]
13920str.__format__ as unicode___format__
13921
13922 format_spec: unicode
13923 /
13924
13925Return a formatted version of the string as described by format_spec.
13926[clinic start generated code]*/
13927
Eric Smith4a7d76d2008-05-30 18:10:19 +000013928static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013929unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013930/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013931{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013932 _PyUnicodeWriter writer;
13933 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013934
Victor Stinnerd3f08822012-05-29 12:57:52 +020013935 if (PyUnicode_READY(self) == -1)
13936 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013937 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013938 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13939 self, format_spec, 0,
13940 PyUnicode_GET_LENGTH(format_spec));
13941 if (ret == -1) {
13942 _PyUnicodeWriter_Dealloc(&writer);
13943 return NULL;
13944 }
13945 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013946}
13947
INADA Naoki3ae20562017-01-16 20:41:20 +090013948/*[clinic input]
13949str.__sizeof__ as unicode_sizeof
13950
13951Return the size of the string in memory, in bytes.
13952[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013953
13954static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013955unicode_sizeof_impl(PyObject *self)
13956/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013957{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013958 Py_ssize_t size;
13959
13960 /* If it's a compact object, account for base structure +
13961 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013962 if (PyUnicode_IS_COMPACT_ASCII(self))
13963 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13964 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013965 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013966 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013967 else {
13968 /* If it is a two-block object, account for base object, and
13969 for character block if present. */
13970 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013971 if (_PyUnicode_DATA_ANY(self))
13972 size += (PyUnicode_GET_LENGTH(self) + 1) *
13973 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013974 }
13975 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013976 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013977 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13978 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13979 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13980 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013981
13982 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013983}
13984
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013985static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013986unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013987{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013988 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013989 if (!copy)
13990 return NULL;
13991 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013992}
13993
Guido van Rossumd57fd912000-03-10 22:53:23 +000013994static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013995 UNICODE_ENCODE_METHODDEF
13996 UNICODE_REPLACE_METHODDEF
13997 UNICODE_SPLIT_METHODDEF
13998 UNICODE_RSPLIT_METHODDEF
13999 UNICODE_JOIN_METHODDEF
14000 UNICODE_CAPITALIZE_METHODDEF
14001 UNICODE_CASEFOLD_METHODDEF
14002 UNICODE_TITLE_METHODDEF
14003 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014004 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014005 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014006 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014007 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014008 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014009 UNICODE_LJUST_METHODDEF
14010 UNICODE_LOWER_METHODDEF
14011 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014012 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14013 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014014 UNICODE_RJUST_METHODDEF
14015 UNICODE_RSTRIP_METHODDEF
14016 UNICODE_RPARTITION_METHODDEF
14017 UNICODE_SPLITLINES_METHODDEF
14018 UNICODE_STRIP_METHODDEF
14019 UNICODE_SWAPCASE_METHODDEF
14020 UNICODE_TRANSLATE_METHODDEF
14021 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014022 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14023 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014024 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014025 UNICODE_ISLOWER_METHODDEF
14026 UNICODE_ISUPPER_METHODDEF
14027 UNICODE_ISTITLE_METHODDEF
14028 UNICODE_ISSPACE_METHODDEF
14029 UNICODE_ISDECIMAL_METHODDEF
14030 UNICODE_ISDIGIT_METHODDEF
14031 UNICODE_ISNUMERIC_METHODDEF
14032 UNICODE_ISALPHA_METHODDEF
14033 UNICODE_ISALNUM_METHODDEF
14034 UNICODE_ISIDENTIFIER_METHODDEF
14035 UNICODE_ISPRINTABLE_METHODDEF
14036 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014037 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014038 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014039 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014040 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014041 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014042#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014043 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014044 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014045#endif
14046
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014047 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014048 {NULL, NULL}
14049};
14050
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014051static PyObject *
14052unicode_mod(PyObject *v, PyObject *w)
14053{
Brian Curtindfc80e32011-08-10 20:28:54 -050014054 if (!PyUnicode_Check(v))
14055 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014056 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014057}
14058
14059static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014060 0, /*nb_add*/
14061 0, /*nb_subtract*/
14062 0, /*nb_multiply*/
14063 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014064};
14065
Guido van Rossumd57fd912000-03-10 22:53:23 +000014066static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014067 (lenfunc) unicode_length, /* sq_length */
14068 PyUnicode_Concat, /* sq_concat */
14069 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14070 (ssizeargfunc) unicode_getitem, /* sq_item */
14071 0, /* sq_slice */
14072 0, /* sq_ass_item */
14073 0, /* sq_ass_slice */
14074 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014075};
14076
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014077static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014078unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014079{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014080 if (PyUnicode_READY(self) == -1)
14081 return NULL;
14082
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014083 if (PyIndex_Check(item)) {
14084 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014085 if (i == -1 && PyErr_Occurred())
14086 return NULL;
14087 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014088 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014089 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014090 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014091 Py_ssize_t start, stop, step, slicelength, i;
14092 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014093 PyObject *result;
14094 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014095 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014096 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014097
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014098 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014099 return NULL;
14100 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014101 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14102 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014103
14104 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014105 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014106 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014107 slicelength == PyUnicode_GET_LENGTH(self)) {
14108 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014109 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014110 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014111 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014112 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014113 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014114 src_kind = PyUnicode_KIND(self);
14115 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014116 if (!PyUnicode_IS_ASCII(self)) {
14117 kind_limit = kind_maxchar_limit(src_kind);
14118 max_char = 0;
14119 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14120 ch = PyUnicode_READ(src_kind, src_data, cur);
14121 if (ch > max_char) {
14122 max_char = ch;
14123 if (max_char >= kind_limit)
14124 break;
14125 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014126 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014127 }
Victor Stinner55c99112011-10-13 01:17:06 +020014128 else
14129 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014130 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014131 if (result == NULL)
14132 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014133 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014134 dest_data = PyUnicode_DATA(result);
14135
14136 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014137 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14138 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014139 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014140 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014141 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014142 } else {
14143 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14144 return NULL;
14145 }
14146}
14147
14148static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014149 (lenfunc)unicode_length, /* mp_length */
14150 (binaryfunc)unicode_subscript, /* mp_subscript */
14151 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014152};
14153
Guido van Rossumd57fd912000-03-10 22:53:23 +000014154
Guido van Rossumd57fd912000-03-10 22:53:23 +000014155/* Helpers for PyUnicode_Format() */
14156
Victor Stinnera47082312012-10-04 02:19:54 +020014157struct unicode_formatter_t {
14158 PyObject *args;
14159 int args_owned;
14160 Py_ssize_t arglen, argidx;
14161 PyObject *dict;
14162
14163 enum PyUnicode_Kind fmtkind;
14164 Py_ssize_t fmtcnt, fmtpos;
14165 void *fmtdata;
14166 PyObject *fmtstr;
14167
14168 _PyUnicodeWriter writer;
14169};
14170
14171struct unicode_format_arg_t {
14172 Py_UCS4 ch;
14173 int flags;
14174 Py_ssize_t width;
14175 int prec;
14176 int sign;
14177};
14178
Guido van Rossumd57fd912000-03-10 22:53:23 +000014179static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014180unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014181{
Victor Stinnera47082312012-10-04 02:19:54 +020014182 Py_ssize_t argidx = ctx->argidx;
14183
14184 if (argidx < ctx->arglen) {
14185 ctx->argidx++;
14186 if (ctx->arglen < 0)
14187 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014188 else
Victor Stinnera47082312012-10-04 02:19:54 +020014189 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014190 }
14191 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014192 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014193 return NULL;
14194}
14195
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014196/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014197
Victor Stinnera47082312012-10-04 02:19:54 +020014198/* Format a float into the writer if the writer is not NULL, or into *p_output
14199 otherwise.
14200
14201 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014202static int
Victor Stinnera47082312012-10-04 02:19:54 +020014203formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14204 PyObject **p_output,
14205 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014206{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014207 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014208 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014209 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014210 int prec;
14211 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014212
Guido van Rossumd57fd912000-03-10 22:53:23 +000014213 x = PyFloat_AsDouble(v);
14214 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014215 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014216
Victor Stinnera47082312012-10-04 02:19:54 +020014217 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014218 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014219 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014220
Victor Stinnera47082312012-10-04 02:19:54 +020014221 if (arg->flags & F_ALT)
14222 dtoa_flags = Py_DTSF_ALT;
14223 else
14224 dtoa_flags = 0;
14225 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014226 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014227 return -1;
14228 len = strlen(p);
14229 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014230 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014231 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014232 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014233 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014234 }
14235 else
14236 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014237 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014238 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014239}
14240
Victor Stinnerd0880d52012-04-27 23:40:13 +020014241/* formatlong() emulates the format codes d, u, o, x and X, and
14242 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14243 * Python's regular ints.
14244 * Return value: a new PyUnicodeObject*, or NULL if error.
14245 * The output string is of the form
14246 * "-"? ("0x" | "0X")? digit+
14247 * "0x"/"0X" are present only for x and X conversions, with F_ALT
 *      set in flags.  The case of hex digits will be correct.
14249 * There will be at least prec digits, zero-filled on the left if
14250 * necessary to get that many.
14251 * val object to be converted
 *      alt     nonzero if the F_ALT flag is set (the base marker is kept)
14253 * prec minimum number of digits; 0-fill on left if needed
14254 * type a character in [duoxX]; u acts the same as d
14255 *
14256 * CAUTION: o, x and X conversions on regular ints can never
14257 * produce a '-' sign, but can for Python's unbounded ints.
14258 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014259PyObject *
14260_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014261{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014262 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014263 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014264 Py_ssize_t i;
14265 int sign; /* 1 if '-', else 0 */
14266 int len; /* number of characters */
14267 Py_ssize_t llen;
14268 int numdigits; /* len == numnondigits + numdigits */
14269 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014270
Victor Stinnerd0880d52012-04-27 23:40:13 +020014271 /* Avoid exceeding SSIZE_T_MAX */
14272 if (prec > INT_MAX-3) {
14273 PyErr_SetString(PyExc_OverflowError,
14274 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014275 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014276 }
14277
14278 assert(PyLong_Check(val));
14279
14280 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014282 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014283 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014285 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014286 /* int and int subclasses should print numerically when a numeric */
14287 /* format code is used (see issue18780) */
14288 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014289 break;
14290 case 'o':
14291 numnondigits = 2;
14292 result = PyNumber_ToBase(val, 8);
14293 break;
14294 case 'x':
14295 case 'X':
14296 numnondigits = 2;
14297 result = PyNumber_ToBase(val, 16);
14298 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014299 }
14300 if (!result)
14301 return NULL;
14302
14303 assert(unicode_modifiable(result));
14304 assert(PyUnicode_IS_READY(result));
14305 assert(PyUnicode_IS_ASCII(result));
14306
14307 /* To modify the string in-place, there can only be one reference. */
14308 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014309 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014310 PyErr_BadInternalCall();
14311 return NULL;
14312 }
14313 buf = PyUnicode_DATA(result);
14314 llen = PyUnicode_GET_LENGTH(result);
14315 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014316 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014317 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014318 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014319 return NULL;
14320 }
14321 len = (int)llen;
14322 sign = buf[0] == '-';
14323 numnondigits += sign;
14324 numdigits = len - numnondigits;
14325 assert(numdigits > 0);
14326
14327 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014328 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014329 (type == 'o' || type == 'x' || type == 'X'))) {
14330 assert(buf[sign] == '0');
14331 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14332 buf[sign+1] == 'o');
14333 numnondigits -= 2;
14334 buf += 2;
14335 len -= 2;
14336 if (sign)
14337 buf[0] = '-';
14338 assert(len == numnondigits + numdigits);
14339 assert(numdigits > 0);
14340 }
14341
14342 /* Fill with leading zeroes to meet minimum width. */
14343 if (prec > numdigits) {
14344 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14345 numnondigits + prec);
14346 char *b1;
14347 if (!r1) {
14348 Py_DECREF(result);
14349 return NULL;
14350 }
14351 b1 = PyBytes_AS_STRING(r1);
14352 for (i = 0; i < numnondigits; ++i)
14353 *b1++ = *buf++;
14354 for (i = 0; i < prec - numdigits; i++)
14355 *b1++ = '0';
14356 for (i = 0; i < numdigits; i++)
14357 *b1++ = *buf++;
14358 *b1 = '\0';
14359 Py_DECREF(result);
14360 result = r1;
14361 buf = PyBytes_AS_STRING(result);
14362 len = numnondigits + prec;
14363 }
14364
14365 /* Fix up case for hex conversions. */
14366 if (type == 'X') {
14367 /* Need to convert all lower case letters to upper case.
14368 and need to convert 0x to 0X (and -0x to -0X). */
14369 for (i = 0; i < len; i++)
14370 if (buf[i] >= 'a' && buf[i] <= 'x')
14371 buf[i] -= 'a'-'A';
14372 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014373 if (!PyUnicode_Check(result)
14374 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014375 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014376 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014377 Py_DECREF(result);
14378 result = unicode;
14379 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014380 else if (len != PyUnicode_GET_LENGTH(result)) {
14381 if (PyUnicode_Resize(&result, len) < 0)
14382 Py_CLEAR(result);
14383 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014384 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014385}
14386
Ethan Furmandf3ed242014-01-05 06:50:30 -080014387/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014388 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014389 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014390 * -1 and raise an exception on error */
14391static int
Victor Stinnera47082312012-10-04 02:19:54 +020014392mainformatlong(PyObject *v,
14393 struct unicode_format_arg_t *arg,
14394 PyObject **p_output,
14395 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014396{
14397 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014398 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014399
14400 if (!PyNumber_Check(v))
14401 goto wrongtype;
14402
Ethan Furman9ab74802014-03-21 06:38:46 -070014403 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014404 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014405 if (type == 'o' || type == 'x' || type == 'X') {
14406 iobj = PyNumber_Index(v);
14407 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014408 if (PyErr_ExceptionMatches(PyExc_TypeError))
14409 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014410 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014411 }
14412 }
14413 else {
14414 iobj = PyNumber_Long(v);
14415 if (iobj == NULL ) {
14416 if (PyErr_ExceptionMatches(PyExc_TypeError))
14417 goto wrongtype;
14418 return -1;
14419 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014420 }
14421 assert(PyLong_Check(iobj));
14422 }
14423 else {
14424 iobj = v;
14425 Py_INCREF(iobj);
14426 }
14427
14428 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014429 && arg->width == -1 && arg->prec == -1
14430 && !(arg->flags & (F_SIGN | F_BLANK))
14431 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014432 {
14433 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014434 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014435 int base;
14436
Victor Stinnera47082312012-10-04 02:19:54 +020014437 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014438 {
14439 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014440 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014441 case 'd':
14442 case 'i':
14443 case 'u':
14444 base = 10;
14445 break;
14446 case 'o':
14447 base = 8;
14448 break;
14449 case 'x':
14450 case 'X':
14451 base = 16;
14452 break;
14453 }
14454
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014455 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14456 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014457 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014458 }
14459 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014460 return 1;
14461 }
14462
Ethan Furmanb95b5612015-01-23 20:05:18 -080014463 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014464 Py_DECREF(iobj);
14465 if (res == NULL)
14466 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014467 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014468 return 0;
14469
14470wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014471 switch(type)
14472 {
14473 case 'o':
14474 case 'x':
14475 case 'X':
14476 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014477 "%%%c format: an integer is required, "
14478 "not %.200s",
14479 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014480 break;
14481 default:
14482 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014483 "%%%c format: a number is required, "
14484 "not %.200s",
14485 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014486 break;
14487 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014488 return -1;
14489}
14490
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014491static Py_UCS4
14492formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014493{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014494 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014495 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014496 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014497 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014498 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014499 goto onError;
14500 }
14501 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014502 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014503 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014504 /* make sure number is a type of integer */
14505 if (!PyLong_Check(v)) {
14506 iobj = PyNumber_Index(v);
14507 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014508 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014509 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014510 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014511 Py_DECREF(iobj);
14512 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014513 else {
14514 x = PyLong_AsLong(v);
14515 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014516 if (x == -1 && PyErr_Occurred())
14517 goto onError;
14518
Victor Stinner8faf8212011-12-08 22:14:11 +010014519 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014520 PyErr_SetString(PyExc_OverflowError,
14521 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014522 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014523 }
14524
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014525 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014526 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014527
Benjamin Peterson29060642009-01-31 22:14:21 +000014528 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014529 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014530 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014531 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014532}
14533
Victor Stinnera47082312012-10-04 02:19:54 +020014534/* Parse options of an argument: flags, width, precision.
14535 Handle also "%(name)" syntax.
14536
14537 Return 0 if the argument has been formatted into arg->str.
14538 Return 1 if the argument has been written into ctx->writer,
14539 Raise an exception and return -1 on error. */
14540static int
14541unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14542 struct unicode_format_arg_t *arg)
14543{
14544#define FORMAT_READ(ctx) \
14545 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14546
14547 PyObject *v;
14548
Victor Stinnera47082312012-10-04 02:19:54 +020014549 if (arg->ch == '(') {
14550 /* Get argument value from a dictionary. Example: "%(name)s". */
14551 Py_ssize_t keystart;
14552 Py_ssize_t keylen;
14553 PyObject *key;
14554 int pcount = 1;
14555
14556 if (ctx->dict == NULL) {
14557 PyErr_SetString(PyExc_TypeError,
14558 "format requires a mapping");
14559 return -1;
14560 }
14561 ++ctx->fmtpos;
14562 --ctx->fmtcnt;
14563 keystart = ctx->fmtpos;
14564 /* Skip over balanced parentheses */
14565 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14566 arg->ch = FORMAT_READ(ctx);
14567 if (arg->ch == ')')
14568 --pcount;
14569 else if (arg->ch == '(')
14570 ++pcount;
14571 ctx->fmtpos++;
14572 }
14573 keylen = ctx->fmtpos - keystart - 1;
14574 if (ctx->fmtcnt < 0 || pcount > 0) {
14575 PyErr_SetString(PyExc_ValueError,
14576 "incomplete format key");
14577 return -1;
14578 }
14579 key = PyUnicode_Substring(ctx->fmtstr,
14580 keystart, keystart + keylen);
14581 if (key == NULL)
14582 return -1;
14583 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014584 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014585 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014586 }
14587 ctx->args = PyObject_GetItem(ctx->dict, key);
14588 Py_DECREF(key);
14589 if (ctx->args == NULL)
14590 return -1;
14591 ctx->args_owned = 1;
14592 ctx->arglen = -1;
14593 ctx->argidx = -2;
14594 }
14595
14596 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014597 while (--ctx->fmtcnt >= 0) {
14598 arg->ch = FORMAT_READ(ctx);
14599 ctx->fmtpos++;
14600 switch (arg->ch) {
14601 case '-': arg->flags |= F_LJUST; continue;
14602 case '+': arg->flags |= F_SIGN; continue;
14603 case ' ': arg->flags |= F_BLANK; continue;
14604 case '#': arg->flags |= F_ALT; continue;
14605 case '0': arg->flags |= F_ZERO; continue;
14606 }
14607 break;
14608 }
14609
14610 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014611 if (arg->ch == '*') {
14612 v = unicode_format_getnextarg(ctx);
14613 if (v == NULL)
14614 return -1;
14615 if (!PyLong_Check(v)) {
14616 PyErr_SetString(PyExc_TypeError,
14617 "* wants int");
14618 return -1;
14619 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014620 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014621 if (arg->width == -1 && PyErr_Occurred())
14622 return -1;
14623 if (arg->width < 0) {
14624 arg->flags |= F_LJUST;
14625 arg->width = -arg->width;
14626 }
14627 if (--ctx->fmtcnt >= 0) {
14628 arg->ch = FORMAT_READ(ctx);
14629 ctx->fmtpos++;
14630 }
14631 }
14632 else if (arg->ch >= '0' && arg->ch <= '9') {
14633 arg->width = arg->ch - '0';
14634 while (--ctx->fmtcnt >= 0) {
14635 arg->ch = FORMAT_READ(ctx);
14636 ctx->fmtpos++;
14637 if (arg->ch < '0' || arg->ch > '9')
14638 break;
14639 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14640 mixing signed and unsigned comparison. Since arg->ch is between
14641 '0' and '9', casting to int is safe. */
14642 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14643 PyErr_SetString(PyExc_ValueError,
14644 "width too big");
14645 return -1;
14646 }
14647 arg->width = arg->width*10 + (arg->ch - '0');
14648 }
14649 }
14650
14651 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014652 if (arg->ch == '.') {
14653 arg->prec = 0;
14654 if (--ctx->fmtcnt >= 0) {
14655 arg->ch = FORMAT_READ(ctx);
14656 ctx->fmtpos++;
14657 }
14658 if (arg->ch == '*') {
14659 v = unicode_format_getnextarg(ctx);
14660 if (v == NULL)
14661 return -1;
14662 if (!PyLong_Check(v)) {
14663 PyErr_SetString(PyExc_TypeError,
14664 "* wants int");
14665 return -1;
14666 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014667 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014668 if (arg->prec == -1 && PyErr_Occurred())
14669 return -1;
14670 if (arg->prec < 0)
14671 arg->prec = 0;
14672 if (--ctx->fmtcnt >= 0) {
14673 arg->ch = FORMAT_READ(ctx);
14674 ctx->fmtpos++;
14675 }
14676 }
14677 else if (arg->ch >= '0' && arg->ch <= '9') {
14678 arg->prec = arg->ch - '0';
14679 while (--ctx->fmtcnt >= 0) {
14680 arg->ch = FORMAT_READ(ctx);
14681 ctx->fmtpos++;
14682 if (arg->ch < '0' || arg->ch > '9')
14683 break;
14684 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14685 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014686 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014687 return -1;
14688 }
14689 arg->prec = arg->prec*10 + (arg->ch - '0');
14690 }
14691 }
14692 }
14693
14694 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14695 if (ctx->fmtcnt >= 0) {
14696 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14697 if (--ctx->fmtcnt >= 0) {
14698 arg->ch = FORMAT_READ(ctx);
14699 ctx->fmtpos++;
14700 }
14701 }
14702 }
14703 if (ctx->fmtcnt < 0) {
14704 PyErr_SetString(PyExc_ValueError,
14705 "incomplete format");
14706 return -1;
14707 }
14708 return 0;
14709
14710#undef FORMAT_READ
14711}
14712
14713/* Format one argument. Supported conversion specifiers:
14714
14715 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014716 - "i", "d", "u": int or float
14717 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014718 - "e", "E", "f", "F", "g", "G": float
14719 - "c": int or str (1 character)
14720
Victor Stinner8dbd4212012-12-04 09:30:24 +010014721 When possible, the output is written directly into the Unicode writer
14722 (ctx->writer). A string is created when padding is required.
14723
Victor Stinnera47082312012-10-04 02:19:54 +020014724 Return 0 if the argument has been formatted into *p_str,
14725 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014726 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014727static int
14728unicode_format_arg_format(struct unicode_formatter_t *ctx,
14729 struct unicode_format_arg_t *arg,
14730 PyObject **p_str)
14731{
14732 PyObject *v;
14733 _PyUnicodeWriter *writer = &ctx->writer;
14734
14735 if (ctx->fmtcnt == 0)
14736 ctx->writer.overallocate = 0;
14737
Victor Stinnera47082312012-10-04 02:19:54 +020014738 v = unicode_format_getnextarg(ctx);
14739 if (v == NULL)
14740 return -1;
14741
Victor Stinnera47082312012-10-04 02:19:54 +020014742
14743 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014744 case 's':
14745 case 'r':
14746 case 'a':
14747 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14748 /* Fast path */
14749 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14750 return -1;
14751 return 1;
14752 }
14753
14754 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14755 *p_str = v;
14756 Py_INCREF(*p_str);
14757 }
14758 else {
14759 if (arg->ch == 's')
14760 *p_str = PyObject_Str(v);
14761 else if (arg->ch == 'r')
14762 *p_str = PyObject_Repr(v);
14763 else
14764 *p_str = PyObject_ASCII(v);
14765 }
14766 break;
14767
14768 case 'i':
14769 case 'd':
14770 case 'u':
14771 case 'o':
14772 case 'x':
14773 case 'X':
14774 {
14775 int ret = mainformatlong(v, arg, p_str, writer);
14776 if (ret != 0)
14777 return ret;
14778 arg->sign = 1;
14779 break;
14780 }
14781
14782 case 'e':
14783 case 'E':
14784 case 'f':
14785 case 'F':
14786 case 'g':
14787 case 'G':
14788 if (arg->width == -1 && arg->prec == -1
14789 && !(arg->flags & (F_SIGN | F_BLANK)))
14790 {
14791 /* Fast path */
14792 if (formatfloat(v, arg, NULL, writer) == -1)
14793 return -1;
14794 return 1;
14795 }
14796
14797 arg->sign = 1;
14798 if (formatfloat(v, arg, p_str, NULL) == -1)
14799 return -1;
14800 break;
14801
14802 case 'c':
14803 {
14804 Py_UCS4 ch = formatchar(v);
14805 if (ch == (Py_UCS4) -1)
14806 return -1;
14807 if (arg->width == -1 && arg->prec == -1) {
14808 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014809 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014810 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014811 return 1;
14812 }
14813 *p_str = PyUnicode_FromOrdinal(ch);
14814 break;
14815 }
14816
14817 default:
14818 PyErr_Format(PyExc_ValueError,
14819 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014820 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014821 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14822 (int)arg->ch,
14823 ctx->fmtpos - 1);
14824 return -1;
14825 }
14826 if (*p_str == NULL)
14827 return -1;
14828 assert (PyUnicode_Check(*p_str));
14829 return 0;
14830}
14831
14832static int
14833unicode_format_arg_output(struct unicode_formatter_t *ctx,
14834 struct unicode_format_arg_t *arg,
14835 PyObject *str)
14836{
14837 Py_ssize_t len;
14838 enum PyUnicode_Kind kind;
14839 void *pbuf;
14840 Py_ssize_t pindex;
14841 Py_UCS4 signchar;
14842 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014843 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014844 Py_ssize_t sublen;
14845 _PyUnicodeWriter *writer = &ctx->writer;
14846 Py_UCS4 fill;
14847
14848 fill = ' ';
14849 if (arg->sign && arg->flags & F_ZERO)
14850 fill = '0';
14851
14852 if (PyUnicode_READY(str) == -1)
14853 return -1;
14854
14855 len = PyUnicode_GET_LENGTH(str);
14856 if ((arg->width == -1 || arg->width <= len)
14857 && (arg->prec == -1 || arg->prec >= len)
14858 && !(arg->flags & (F_SIGN | F_BLANK)))
14859 {
14860 /* Fast path */
14861 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14862 return -1;
14863 return 0;
14864 }
14865
14866 /* Truncate the string for "s", "r" and "a" formats
14867 if the precision is set */
14868 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14869 if (arg->prec >= 0 && len > arg->prec)
14870 len = arg->prec;
14871 }
14872
14873 /* Adjust sign and width */
14874 kind = PyUnicode_KIND(str);
14875 pbuf = PyUnicode_DATA(str);
14876 pindex = 0;
14877 signchar = '\0';
14878 if (arg->sign) {
14879 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14880 if (ch == '-' || ch == '+') {
14881 signchar = ch;
14882 len--;
14883 pindex++;
14884 }
14885 else if (arg->flags & F_SIGN)
14886 signchar = '+';
14887 else if (arg->flags & F_BLANK)
14888 signchar = ' ';
14889 else
14890 arg->sign = 0;
14891 }
14892 if (arg->width < len)
14893 arg->width = len;
14894
14895 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014896 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014897 if (!(arg->flags & F_LJUST)) {
14898 if (arg->sign) {
14899 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014900 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014901 }
14902 else {
14903 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014904 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014905 }
14906 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014907 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14908 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014909 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014910 }
14911
Victor Stinnera47082312012-10-04 02:19:54 +020014912 buflen = arg->width;
14913 if (arg->sign && len == arg->width)
14914 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014915 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014916 return -1;
14917
14918 /* Write the sign if needed */
14919 if (arg->sign) {
14920 if (fill != ' ') {
14921 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14922 writer->pos += 1;
14923 }
14924 if (arg->width > len)
14925 arg->width--;
14926 }
14927
14928 /* Write the numeric prefix for "x", "X" and "o" formats
14929 if the alternate form is used.
14930 For example, write "0x" for the "%#x" format. */
14931 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14932 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14933 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14934 if (fill != ' ') {
14935 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14936 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14937 writer->pos += 2;
14938 pindex += 2;
14939 }
14940 arg->width -= 2;
14941 if (arg->width < 0)
14942 arg->width = 0;
14943 len -= 2;
14944 }
14945
14946 /* Pad left with the fill character if needed */
14947 if (arg->width > len && !(arg->flags & F_LJUST)) {
14948 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014949 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014950 writer->pos += sublen;
14951 arg->width = len;
14952 }
14953
14954 /* If padding with spaces: write sign if needed and/or numeric prefix if
14955 the alternate form is used */
14956 if (fill == ' ') {
14957 if (arg->sign) {
14958 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14959 writer->pos += 1;
14960 }
14961 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14962 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14963 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14964 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14965 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14966 writer->pos += 2;
14967 pindex += 2;
14968 }
14969 }
14970
14971 /* Write characters */
14972 if (len) {
14973 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14974 str, pindex, len);
14975 writer->pos += len;
14976 }
14977
14978 /* Pad right with the fill character if needed */
14979 if (arg->width > len) {
14980 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014981 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014982 writer->pos += sublen;
14983 }
14984 return 0;
14985}
14986
14987/* Helper of PyUnicode_Format(): format one arg.
14988 Return 0 on success, raise an exception and return -1 on error. */
14989static int
14990unicode_format_arg(struct unicode_formatter_t *ctx)
14991{
14992 struct unicode_format_arg_t arg;
14993 PyObject *str;
14994 int ret;
14995
Victor Stinner8dbd4212012-12-04 09:30:24 +010014996 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014997 if (arg.ch == '%') {
14998 ctx->fmtpos++;
14999 ctx->fmtcnt--;
15000 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15001 return -1;
15002 return 0;
15003 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015004 arg.flags = 0;
15005 arg.width = -1;
15006 arg.prec = -1;
15007 arg.sign = 0;
15008 str = NULL;
15009
Victor Stinnera47082312012-10-04 02:19:54 +020015010 ret = unicode_format_arg_parse(ctx, &arg);
15011 if (ret == -1)
15012 return -1;
15013
15014 ret = unicode_format_arg_format(ctx, &arg, &str);
15015 if (ret == -1)
15016 return -1;
15017
15018 if (ret != 1) {
15019 ret = unicode_format_arg_output(ctx, &arg, str);
15020 Py_DECREF(str);
15021 if (ret == -1)
15022 return -1;
15023 }
15024
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015025 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015026 PyErr_SetString(PyExc_TypeError,
15027 "not all arguments converted during string formatting");
15028 return -1;
15029 }
15030 return 0;
15031}
15032
Alexander Belopolsky40018472011-02-26 01:02:56 +000015033PyObject *
15034PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015035{
Victor Stinnera47082312012-10-04 02:19:54 +020015036 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015037
Guido van Rossumd57fd912000-03-10 22:53:23 +000015038 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015039 PyErr_BadInternalCall();
15040 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015041 }
Victor Stinnera47082312012-10-04 02:19:54 +020015042
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015043 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015044 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015045
15046 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015047 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15048 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15049 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15050 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015051
Victor Stinner8f674cc2013-04-17 23:02:17 +020015052 _PyUnicodeWriter_Init(&ctx.writer);
15053 ctx.writer.min_length = ctx.fmtcnt + 100;
15054 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015055
Guido van Rossumd57fd912000-03-10 22:53:23 +000015056 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015057 ctx.arglen = PyTuple_Size(args);
15058 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015059 }
15060 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015061 ctx.arglen = -1;
15062 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015063 }
Victor Stinnera47082312012-10-04 02:19:54 +020015064 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015065 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015066 ctx.dict = args;
15067 else
15068 ctx.dict = NULL;
15069 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015070
Victor Stinnera47082312012-10-04 02:19:54 +020015071 while (--ctx.fmtcnt >= 0) {
15072 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015073 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015074
15075 nonfmtpos = ctx.fmtpos++;
15076 while (ctx.fmtcnt >= 0 &&
15077 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15078 ctx.fmtpos++;
15079 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015080 }
Victor Stinnera47082312012-10-04 02:19:54 +020015081 if (ctx.fmtcnt < 0) {
15082 ctx.fmtpos--;
15083 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015084 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015085
Victor Stinnercfc4c132013-04-03 01:48:39 +020015086 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15087 nonfmtpos, ctx.fmtpos) < 0)
15088 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015089 }
15090 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015091 ctx.fmtpos++;
15092 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015093 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015094 }
15095 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015096
Victor Stinnera47082312012-10-04 02:19:54 +020015097 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015098 PyErr_SetString(PyExc_TypeError,
15099 "not all arguments converted during string formatting");
15100 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015101 }
15102
Victor Stinnera47082312012-10-04 02:19:54 +020015103 if (ctx.args_owned) {
15104 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015105 }
Victor Stinnera47082312012-10-04 02:19:54 +020015106 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015107
Benjamin Peterson29060642009-01-31 22:14:21 +000015108 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015109 _PyUnicodeWriter_Dealloc(&ctx.writer);
15110 if (ctx.args_owned) {
15111 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015112 }
15113 return NULL;
15114}
15115
Jeremy Hylton938ace62002-07-17 16:30:39 +000015116static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015117unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15118
Tim Peters6d6c1a32001-08-02 04:15:00 +000015119static PyObject *
15120unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15121{
Benjamin Peterson29060642009-01-31 22:14:21 +000015122 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015123 static char *kwlist[] = {"object", "encoding", "errors", 0};
15124 char *encoding = NULL;
15125 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015126
Benjamin Peterson14339b62009-01-31 16:36:08 +000015127 if (type != &PyUnicode_Type)
15128 return unicode_subtype_new(type, args, kwds);
15129 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015130 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015131 return NULL;
15132 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015133 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015134 if (encoding == NULL && errors == NULL)
15135 return PyObject_Str(x);
15136 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015137 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015138}
15139
Guido van Rossume023fe02001-08-30 03:12:59 +000015140static PyObject *
15141unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15142{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015143 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015144 Py_ssize_t length, char_size;
15145 int share_wstr, share_utf8;
15146 unsigned int kind;
15147 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015148
Benjamin Peterson14339b62009-01-31 16:36:08 +000015149 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015150
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015151 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015152 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015153 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015154 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015155 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015156 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015157 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015158 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015159
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015160 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015161 if (self == NULL) {
15162 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015163 return NULL;
15164 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015165 kind = PyUnicode_KIND(unicode);
15166 length = PyUnicode_GET_LENGTH(unicode);
15167
15168 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015169#ifdef Py_DEBUG
15170 _PyUnicode_HASH(self) = -1;
15171#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015172 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015173#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015174 _PyUnicode_STATE(self).interned = 0;
15175 _PyUnicode_STATE(self).kind = kind;
15176 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015177 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015178 _PyUnicode_STATE(self).ready = 1;
15179 _PyUnicode_WSTR(self) = NULL;
15180 _PyUnicode_UTF8_LENGTH(self) = 0;
15181 _PyUnicode_UTF8(self) = NULL;
15182 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015183 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015184
15185 share_utf8 = 0;
15186 share_wstr = 0;
15187 if (kind == PyUnicode_1BYTE_KIND) {
15188 char_size = 1;
15189 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15190 share_utf8 = 1;
15191 }
15192 else if (kind == PyUnicode_2BYTE_KIND) {
15193 char_size = 2;
15194 if (sizeof(wchar_t) == 2)
15195 share_wstr = 1;
15196 }
15197 else {
15198 assert(kind == PyUnicode_4BYTE_KIND);
15199 char_size = 4;
15200 if (sizeof(wchar_t) == 4)
15201 share_wstr = 1;
15202 }
15203
15204 /* Ensure we won't overflow the length. */
15205 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15206 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015207 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015208 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015209 data = PyObject_MALLOC((length + 1) * char_size);
15210 if (data == NULL) {
15211 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015212 goto onError;
15213 }
15214
Victor Stinnerc3c74152011-10-02 20:39:55 +020015215 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015216 if (share_utf8) {
15217 _PyUnicode_UTF8_LENGTH(self) = length;
15218 _PyUnicode_UTF8(self) = data;
15219 }
15220 if (share_wstr) {
15221 _PyUnicode_WSTR_LENGTH(self) = length;
15222 _PyUnicode_WSTR(self) = (wchar_t *)data;
15223 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015224
Christian Heimesf051e432016-09-13 20:22:02 +020015225 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015226 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015227 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015228#ifdef Py_DEBUG
15229 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15230#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015231 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015232 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015233
15234onError:
15235 Py_DECREF(unicode);
15236 Py_DECREF(self);
15237 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015238}
15239
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015240PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015241"str(object='') -> str\n\
15242str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015243\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015244Create a new string object from the given object. If encoding or\n\
15245errors is specified, then the object must expose a data buffer\n\
15246that will be decoded using the given encoding and error handler.\n\
15247Otherwise, returns the result of object.__str__() (if defined)\n\
15248or repr(object).\n\
15249encoding defaults to sys.getdefaultencoding().\n\
15250errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015251
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015252static PyObject *unicode_iter(PyObject *seq);
15253
Guido van Rossumd57fd912000-03-10 22:53:23 +000015254PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015255 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015256 "str", /* tp_name */
15257 sizeof(PyUnicodeObject), /* tp_basicsize */
15258 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015259 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015260 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015261 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015262 0, /* tp_getattr */
15263 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015264 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015265 unicode_repr, /* tp_repr */
15266 &unicode_as_number, /* tp_as_number */
15267 &unicode_as_sequence, /* tp_as_sequence */
15268 &unicode_as_mapping, /* tp_as_mapping */
15269 (hashfunc) unicode_hash, /* tp_hash*/
15270 0, /* tp_call*/
15271 (reprfunc) unicode_str, /* tp_str */
15272 PyObject_GenericGetAttr, /* tp_getattro */
15273 0, /* tp_setattro */
15274 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015276 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15277 unicode_doc, /* tp_doc */
15278 0, /* tp_traverse */
15279 0, /* tp_clear */
15280 PyUnicode_RichCompare, /* tp_richcompare */
15281 0, /* tp_weaklistoffset */
15282 unicode_iter, /* tp_iter */
15283 0, /* tp_iternext */
15284 unicode_methods, /* tp_methods */
15285 0, /* tp_members */
15286 0, /* tp_getset */
15287 &PyBaseObject_Type, /* tp_base */
15288 0, /* tp_dict */
15289 0, /* tp_descr_get */
15290 0, /* tp_descr_set */
15291 0, /* tp_dictoffset */
15292 0, /* tp_init */
15293 0, /* tp_alloc */
15294 unicode_new, /* tp_new */
15295 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015296};
15297
15298/* Initialize the Unicode implementation */
15299
Victor Stinner331a6a52019-05-27 16:39:22 +020015300PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015301_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015302{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015303 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015304 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015305 0x000A, /* LINE FEED */
15306 0x000D, /* CARRIAGE RETURN */
15307 0x001C, /* FILE SEPARATOR */
15308 0x001D, /* GROUP SEPARATOR */
15309 0x001E, /* RECORD SEPARATOR */
15310 0x0085, /* NEXT LINE */
15311 0x2028, /* LINE SEPARATOR */
15312 0x2029, /* PARAGRAPH SEPARATOR */
15313 };
15314
Fred Drakee4315f52000-05-09 19:53:39 +000015315 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015316 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015317 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015318 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015319 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015320 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015321
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015322 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015323 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015324 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015325
15326 /* initialize the linebreak bloom filter */
15327 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015328 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015329 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015330
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015331 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015332 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015333 }
15334 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015335 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015336 }
15337 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015338 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015339 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015340 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015341}
15342
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015343
Walter Dörwald16807132007-05-25 13:52:07 +000015344void
15345PyUnicode_InternInPlace(PyObject **p)
15346{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015347 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015348 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015349#ifdef Py_DEBUG
15350 assert(s != NULL);
15351 assert(_PyUnicode_CHECK(s));
15352#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015353 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015354 return;
15355#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015356 /* If it's a subclass, we don't really know what putting
15357 it in the interned dict might do. */
15358 if (!PyUnicode_CheckExact(s))
15359 return;
15360 if (PyUnicode_CHECK_INTERNED(s))
15361 return;
15362 if (interned == NULL) {
15363 interned = PyDict_New();
15364 if (interned == NULL) {
15365 PyErr_Clear(); /* Don't leave an exception */
15366 return;
15367 }
15368 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015369 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015370 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015371 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015372 if (t == NULL) {
15373 PyErr_Clear();
15374 return;
15375 }
15376 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015377 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015378 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015379 return;
15380 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015381 /* The two references in interned are not counted by refcnt.
15382 The deallocator will take care of this */
15383 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015384 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015385}
15386
15387void
15388PyUnicode_InternImmortal(PyObject **p)
15389{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015390 PyUnicode_InternInPlace(p);
15391 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015392 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 Py_INCREF(*p);
15394 }
Walter Dörwald16807132007-05-25 13:52:07 +000015395}
15396
15397PyObject *
15398PyUnicode_InternFromString(const char *cp)
15399{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015400 PyObject *s = PyUnicode_FromString(cp);
15401 if (s == NULL)
15402 return NULL;
15403 PyUnicode_InternInPlace(&s);
15404 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015405}
15406
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015407
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo the interning of every string stored in the interned dict, then
   clear and release the dict.  Only compiled for leak-detector builds. */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *str = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        int state = PyUnicode_CHECK_INTERNED(str);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_REFCNT(str) += 1;
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_REFCNT(str) += 2;
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(str);
#endif
        }
        else {
            /* SSTATE_NOT_INTERNED or an unknown state: a key of the
               interned dict is expected to be interned. */
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015468
15469
15470/********************* Unicode Iterator **************************/
15471
15472typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015473 PyObject_HEAD
15474 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015475 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015476} unicodeiterobject;
15477
15478static void
15479unicodeiter_dealloc(unicodeiterobject *it)
15480{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015481 _PyObject_GC_UNTRACK(it);
15482 Py_XDECREF(it->it_seq);
15483 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015484}
15485
15486static int
15487unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15488{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015489 Py_VISIT(it->it_seq);
15490 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015491}
15492
15493static PyObject *
15494unicodeiter_next(unicodeiterobject *it)
15495{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015496 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015497
Benjamin Peterson14339b62009-01-31 16:36:08 +000015498 assert(it != NULL);
15499 seq = it->it_seq;
15500 if (seq == NULL)
15501 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015502 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015503
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015504 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15505 int kind = PyUnicode_KIND(seq);
15506 void *data = PyUnicode_DATA(seq);
15507 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15508 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015509 if (item != NULL)
15510 ++it->it_index;
15511 return item;
15512 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015513
Benjamin Peterson14339b62009-01-31 16:36:08 +000015514 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015515 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015516 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015517}
15518
15519static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015520unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015521{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015522 Py_ssize_t len = 0;
15523 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015524 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015526}
15527
15528PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15529
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015530static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015531unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015532{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015533 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015534 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015535 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015536 it->it_seq, it->it_index);
15537 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015538 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015539 if (u == NULL)
15540 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015541 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015542 }
15543}
15544
15545PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15546
15547static PyObject *
15548unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15549{
15550 Py_ssize_t index = PyLong_AsSsize_t(state);
15551 if (index == -1 && PyErr_Occurred())
15552 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015553 if (it->it_seq != NULL) {
15554 if (index < 0)
15555 index = 0;
15556 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15557 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15558 it->it_index = index;
15559 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015560 Py_RETURN_NONE;
15561}
15562
15563PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15564
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015565static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015566 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015567 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015568 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15569 reduce_doc},
15570 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15571 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015572 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015573};
15574
15575PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015576 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15577 "str_iterator", /* tp_name */
15578 sizeof(unicodeiterobject), /* tp_basicsize */
15579 0, /* tp_itemsize */
15580 /* methods */
15581 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015582 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015583 0, /* tp_getattr */
15584 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015585 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015586 0, /* tp_repr */
15587 0, /* tp_as_number */
15588 0, /* tp_as_sequence */
15589 0, /* tp_as_mapping */
15590 0, /* tp_hash */
15591 0, /* tp_call */
15592 0, /* tp_str */
15593 PyObject_GenericGetAttr, /* tp_getattro */
15594 0, /* tp_setattro */
15595 0, /* tp_as_buffer */
15596 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15597 0, /* tp_doc */
15598 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15599 0, /* tp_clear */
15600 0, /* tp_richcompare */
15601 0, /* tp_weaklistoffset */
15602 PyObject_SelfIter, /* tp_iter */
15603 (iternextfunc)unicodeiter_next, /* tp_iternext */
15604 unicodeiter_methods, /* tp_methods */
15605 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015606};
15607
15608static PyObject *
15609unicode_iter(PyObject *seq)
15610{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015611 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015612
Benjamin Peterson14339b62009-01-31 16:36:08 +000015613 if (!PyUnicode_Check(seq)) {
15614 PyErr_BadInternalCall();
15615 return NULL;
15616 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015617 if (PyUnicode_READY(seq) == -1)
15618 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15620 if (it == NULL)
15621 return NULL;
15622 it->it_index = 0;
15623 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015624 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 _PyObject_GC_TRACK(it);
15626 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015627}
15628
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015629
/* Return the number of characters before the terminating null character
   of u (delegates to wcslen()). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    size_t length = wcslen(u);
    return length;
}
15635
/* Copy s2, including its terminating null character, to s1.
   Returns s1. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;
    Py_UNICODE ch;

    do {
        ch = *s2++;
        *dst++ = ch;
    } while (ch != 0);
    return s1;
}
15643
/* Copy at most n characters from s2 to s1 and return s1.

   Copying stops right after the terminating null character of s2 has been
   stored, or once n characters have been written, whichever comes first.
   As a consequence s1 is not null-terminated when s2 holds n or more
   characters, and nothing is written when n is 0.  Unlike the C library
   strncpy(), the rest of s1 is not padded with null characters.

   The bound is checked before each store so that no more than n elements
   of s1 are ever written. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    Py_UNICODE *u = s1;
    for (; n != 0; n--) {
        if ((*u++ = *s2++) == 0) {
            break;
        }
    }
    return s1;
}
15653
/* Append s2, including its terminating null character, to the end of the
   string stored in s1.  Returns s1. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1 + wcslen(s1);

    for (;;) {
        Py_UNICODE ch = *s2++;
        *dst++ = ch;
        if (ch == 0) {
            break;
        }
    }
    return s1;
}
15662
/* Compare two null-terminated strings character by character.

   Returns -1, 0 or +1 depending on whether s1 sorts before, equal to or
   after s2.  A string that is a proper prefix of the other sorts first. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (;; s1++, s2++) {
        const Py_UNICODE c1 = *s1;
        const Py_UNICODE c2 = *s2;

        if (c1 == 0 || c2 == 0) {
            /* At least one string ended: the longer one is greater. */
            if (c1) {
                return 1;
            }
            if (c2) {
                return -1;
            }
            return 0;
        }
        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
    }
}
15676
/* Compare at most n characters of two null-terminated strings.

   Returns -1 or +1 at the first position where the strings differ, and 0
   when they are equal over the first n characters or up to their common
   terminating null character. */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        const Py_UNICODE c1 = s1[i];
        const Py_UNICODE c2 = s2[i];

        if (c1 != c2) {
            return (c1 < c2) ? -1 : +1;
        }
        if (c1 == '\0') {
            /* Both strings ended at the same position. */
            break;
        }
    }
    return 0;
}
15693
/* Return a pointer to the first occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminating null character
   is not searched, so looking for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s != 0) {
        if (*s == c) {
            return (Py_UNICODE*)s;
        }
        s++;
    }
    return NULL;
}
15703
/* Return a pointer to the last occurrence of c in the null-terminated
   string s, or NULL if c does not occur.  The terminating null character
   is not searched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = wcslen(s);

    /* Scan backwards, starting from the last character. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c) {
            return (Py_UNICODE*)(s + pos);
        }
    }
    return NULL;
}
Victor Stinner331ea922010-08-10 16:37:20 +000015716
Victor Stinner71133ff2010-09-01 23:43:53 +000015717Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015718PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015719{
Victor Stinner577db2c2011-10-11 22:12:48 +020015720 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015721 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015722
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015723 if (!PyUnicode_Check(unicode)) {
15724 PyErr_BadArgument();
15725 return NULL;
15726 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015727 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015728 if (u == NULL)
15729 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015730 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015731 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015732 PyErr_NoMemory();
15733 return NULL;
15734 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015735 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015736 size *= sizeof(Py_UNICODE);
15737 copy = PyMem_Malloc(size);
15738 if (copy == NULL) {
15739 PyErr_NoMemory();
15740 return NULL;
15741 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015742 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015743 return copy;
15744}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015745
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015746
Victor Stinner709d23d2019-05-02 14:56:30 -040015747static int
15748encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015749{
Victor Stinner709d23d2019-05-02 14:56:30 -040015750 int res;
15751 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15752 if (res == -2) {
15753 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15754 return -1;
15755 }
15756 if (res < 0) {
15757 PyErr_NoMemory();
15758 return -1;
15759 }
15760 return 0;
15761}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015762
Victor Stinner709d23d2019-05-02 14:56:30 -040015763
15764static int
15765config_get_codec_name(wchar_t **config_encoding)
15766{
15767 char *encoding;
15768 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15769 return -1;
15770 }
15771
15772 PyObject *name_obj = NULL;
15773 PyObject *codec = _PyCodec_Lookup(encoding);
15774 PyMem_RawFree(encoding);
15775
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015776 if (!codec)
15777 goto error;
15778
15779 name_obj = PyObject_GetAttrString(codec, "name");
15780 Py_CLEAR(codec);
15781 if (!name_obj) {
15782 goto error;
15783 }
15784
Victor Stinner709d23d2019-05-02 14:56:30 -040015785 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15786 Py_DECREF(name_obj);
15787 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015788 goto error;
15789 }
15790
Victor Stinner709d23d2019-05-02 14:56:30 -040015791 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15792 if (raw_wname == NULL) {
15793 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015794 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015795 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015796 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015797
15798 PyMem_RawFree(*config_encoding);
15799 *config_encoding = raw_wname;
15800
15801 PyMem_Free(wname);
15802 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015803
15804error:
15805 Py_XDECREF(codec);
15806 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015807 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808}
15809
15810
Victor Stinner331a6a52019-05-27 16:39:22 +020015811static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015812init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015813{
Victor Stinner709d23d2019-05-02 14:56:30 -040015814 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015815 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015816 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015817 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015818 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015819 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015820 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015821}
15822
15823
Victor Stinner709d23d2019-05-02 14:56:30 -040015824static int
15825init_fs_codec(PyInterpreterState *interp)
15826{
Victor Stinner331a6a52019-05-27 16:39:22 +020015827 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015828
15829 _Py_error_handler error_handler;
15830 error_handler = get_error_handler_wide(config->filesystem_errors);
15831 if (error_handler == _Py_ERROR_UNKNOWN) {
15832 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15833 return -1;
15834 }
15835
15836 char *encoding, *errors;
15837 if (encode_wstr_utf8(config->filesystem_encoding,
15838 &encoding,
15839 "filesystem_encoding") < 0) {
15840 return -1;
15841 }
15842
15843 if (encode_wstr_utf8(config->filesystem_errors,
15844 &errors,
15845 "filesystem_errors") < 0) {
15846 PyMem_RawFree(encoding);
15847 return -1;
15848 }
15849
15850 PyMem_RawFree(interp->fs_codec.encoding);
15851 interp->fs_codec.encoding = encoding;
15852 PyMem_RawFree(interp->fs_codec.errors);
15853 interp->fs_codec.errors = errors;
15854 interp->fs_codec.error_handler = error_handler;
15855
15856 /* At this point, PyUnicode_EncodeFSDefault() and
15857 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15858 the C implementation of the filesystem encoding. */
15859
15860 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15861 global configuration variables. */
15862 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15863 interp->fs_codec.errors) < 0) {
15864 PyErr_NoMemory();
15865 return -1;
15866 }
15867 return 0;
15868}
15869
15870
Victor Stinner331a6a52019-05-27 16:39:22 +020015871static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015872init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015873{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015874 PyInterpreterState *interp = tstate->interp;
15875
Victor Stinner709d23d2019-05-02 14:56:30 -040015876 /* Update the filesystem encoding to the normalized Python codec name.
15877 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15878 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015879 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015880 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015881 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015882 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015883 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015884 }
15885
Victor Stinner709d23d2019-05-02 14:56:30 -040015886 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015887 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015888 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015889 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015890}
15891
15892
Victor Stinner331a6a52019-05-27 16:39:22 +020015893PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015894_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015895{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015896 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015897 if (_PyStatus_EXCEPTION(status)) {
15898 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015899 }
15900
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015901 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015902}
15903
15904
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Return the result of init_fs_codec() once the configuration has been
   updated, or -1 with MemoryError set if duplicating either string
   fails (the configuration is left untouched in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacements up front so that the configuration is
       only modified when nothing can fail anymore. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15930
15931
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015932void
Victor Stinner3d483342019-11-22 12:27:50 +010015933_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015934{
Victor Stinner3d483342019-11-22 12:27:50 +010015935 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015936#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010015937 /* Insure++ is a memory analysis tool that aids in discovering
15938 * memory leaks and other memory problems. On Python exit, the
15939 * interned string dictionaries are flagged as being in use at exit
15940 * (which it is). Under normal circumstances, this is fine because
15941 * the memory will be automatically reclaimed by the system. Under
15942 * memory debugging, it's a huge source of useless noise, so we
15943 * trade off slower shutdown for less distraction in the memory
15944 * reports. -baw
15945 */
15946 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015947#endif /* __INSURE__ */
15948
Victor Stinner3d483342019-11-22 12:27:50 +010015949 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015950
Victor Stinner3d483342019-11-22 12:27:50 +010015951 for (Py_ssize_t i = 0; i < 256; i++) {
15952 Py_CLEAR(unicode_latin1[i]);
15953 }
15954 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015955 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015956
15957 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15958 PyMem_RawFree(interp->fs_codec.encoding);
15959 interp->fs_codec.encoding = NULL;
15960 PyMem_RawFree(interp->fs_codec.errors);
15961 interp->fs_codec.errors = NULL;
Pablo Galindo016b0282019-12-02 18:09:43 +000015962 interp->config.filesystem_errors = (wchar_t *)_Py_ERROR_UNKNOWN;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015963}
15964
15965
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
15968
15969static PyMethodDef _string_methods[] = {
15970 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15971 METH_O, PyDoc_STR("split the argument as a field name")},
15972 {"formatter_parser", (PyCFunction) formatter_parser,
15973 METH_O, PyDoc_STR("parse the argument as a format string")},
15974 {NULL, NULL}
15975};
15976
15977static struct PyModuleDef _string_module = {
15978 PyModuleDef_HEAD_INIT,
15979 "_string",
15980 PyDoc_STR("string helper module"),
15981 0,
15982 _string_methods,
15983 NULL,
15984 NULL,
15985 NULL,
15986 NULL
15987};
15988
15989PyMODINIT_FUNC
15990PyInit__string(void)
15991{
15992 return PyModule_Create(&_string_module);
15993}
15994
15995
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015996#ifdef __cplusplus
15997}
15998#endif