blob: 6ec4127ff385b3323644c2ea050fd26f7ddffa0b [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Victor Stinner8faf8212011-12-08 22:14:11 +010093/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
94#define MAX_UNICODE 0x10ffff
95
/* Type check used by the assertions in this file.  Debug builds run
   _PyUnicode_CheckConsistency() with check_content == 0; other builds
   only call PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Internal accessors.  The plain field accessors cast op to the relevant
   struct and read the member directly; the variants containing assert()
   check the object first. */

/* utf8 member of the compact header (no checks). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 bytes of a ready string: for a compact ASCII string this is the
   memory directly following the PyASCIIObject header, otherwise the utf8
   member. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* utf8_length member of the compact header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 form of a ready string: the string length itself
   for a compact ASCII string, otherwise the utf8_length member. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked access to individual header fields. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Same as the state.kind / length fields, but asserting the type first. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any member of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY(): asserts the type, then
   evaluates to 0 if the string is already ready and to the result of
   _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the utf8 pointer is the same address as the string's data
   buffer.  Asserts that op is not a compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer is the same address as the string's data
   buffer.  Only the macro parameter is referenced, so the macro works
   whatever the caller's variable is called (it used to name a variable
   `unicode` directly and silently depended on the call site). */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object has its own UTF-8 memory block: op is not a
   compact ASCII string, its utf8 pointer is set, and that pointer is not
   the data buffer itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (!PyUnicode_IS_COMPACT_ASCII(op) \
     && _PyUnicode_UTF8(op) != NULL \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))
158
/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready or the pointer is
   not the data buffer itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (_PyUnicode_WSTR(op) != NULL \
     && (!PyUnicode_IS_READY(op) \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
165
/* Copy a run of characters while converting the element type.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters ("from_type *" pointers, end exclusive); to is a
   "to_type *" pointer to the destination buffer.  Each source element is
   cast to to_type.  The bulk of the run is copied four elements per
   iteration, the remaining 0..3 elements one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_dst = (to_type *)(to);                        \
        const from_type *_src = (from_type *)(begin);           \
        const from_type *_stop = (from_type *)(end);            \
        Py_ssize_t _count = (_stop) - (_src);                   \
        const from_type *_quad_stop =                           \
            _src + _Py_SIZE_ROUND_DOWN(_count, 4);              \
        for (; _src < (_quad_stop); _src += 4, _dst += 4) {     \
            _dst[0] = (to_type) _src[0];                        \
            _dst[1] = (to_type) _src[1];                        \
            _dst[2] = (to_type) _src[2];                        \
            _dst[3] = (to_type) _src[3];                        \
        }                                                       \
        for (; _src < (_stop); ++_src, ++_dst) {                \
            *_dst = (to_type) *_src;                            \
        }                                                       \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200190#ifdef MS_WINDOWS
    /* On Windows, overallocating by 50% is the best factor */
192# define OVERALLOCATE_FACTOR 2
193#else
    /* On Linux, overallocating by 25% is the best factor */
195# define OVERALLOCATE_FACTOR 4
196#endif
197
Walter Dörwald16807132007-05-25 13:52:07 +0000198/* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000205*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0); when that call
   returns NULL, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return unicode_empty from the enclosing function after taking a new
   reference to it with _Py_INCREF_UNICODE_EMPTY(). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Victor Stinner709d23d2019-05-02 14:56:30 -0400268static PyObject *
269unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270 const char *errors);
271static PyObject *
272unicode_decode_utf8(const char *s, Py_ssize_t size,
273 _Py_error_handler error_handler, const char *errors,
274 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200276/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200277static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279/* Single character Unicode strings in the Latin-1 range are being
280 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per ASCII code (0..127): 1 if the
   character counts as whitespace, 0 otherwise.  Entries that are not
   listed are zero-initialized. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
313
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200314/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200315static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100317static int unicode_modifiable(PyObject *unicode);
318
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319
Alexander Belopolsky40018472011-02-26 01:02:56 +0000320static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100321_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200322static PyObject *
323_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324static PyObject *
325_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000328unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000329 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100330 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
Alexander Belopolsky40018472011-02-26 01:02:56 +0000333static void
334raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300335 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100336 PyObject *unicode,
337 Py_ssize_t startpos, Py_ssize_t endpos,
338 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000339
/* Same for linebreaks.

   Lookup table with one entry per ASCII code (0..127): 1 if the
   character counts as a line break, 0 otherwise.  Entries that are not
   listed are zero-initialized. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
367
INADA Naoki3ae20562017-01-16 20:41:20 +0900368static int convert_uc(PyObject *obj, void *addr);
369
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300370#include "clinic/unicodeobject.c.h"
371
Victor Stinner3d4226a2018-08-29 22:21:32 +0200372_Py_error_handler
373_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200374{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200376 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 }
378 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200385 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_OTHER;
397}
398
Victor Stinner709d23d2019-05-02 14:56:30 -0400399
400static _Py_error_handler
401get_error_handler_wide(const wchar_t *errors)
402{
403 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (wcscmp(errors, L"surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (wcscmp(errors, L"replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (wcscmp(errors, L"ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (wcscmp(errors, L"backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (wcscmp(errors, L"surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425}
426
427
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300428/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000430Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000431PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000433#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000434 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000435#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 /* This is actually an illegal character, so it should
437 not be passed to unichr. */
438 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000439#endif
440}
441
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200442int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100443_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200444{
445 PyASCIIObject *ascii;
446 unsigned int kind;
447
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200448 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200449
450 ascii = (PyASCIIObject *)op;
451 kind = ascii->state.kind;
452
Victor Stinnera3b334d2011-10-03 13:53:37 +0200453 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200454 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
455 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200456 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200457 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200458 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200459 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200460
Victor Stinnera41463c2011-10-04 01:05:08 +0200461 if (ascii->state.compact == 1) {
462 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200463 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
464 || kind == PyUnicode_2BYTE_KIND
465 || kind == PyUnicode_4BYTE_KIND);
466 _PyObject_ASSERT(op, ascii->state.ascii == 0);
467 _PyObject_ASSERT(op, ascii->state.ready == 1);
468 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100469 }
470 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200471 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
472
473 data = unicode->data.any;
474 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200475 _PyObject_ASSERT(op, ascii->length == 0);
476 _PyObject_ASSERT(op, ascii->hash == -1);
477 _PyObject_ASSERT(op, ascii->state.compact == 0);
478 _PyObject_ASSERT(op, ascii->state.ascii == 0);
479 _PyObject_ASSERT(op, ascii->state.ready == 0);
480 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
481 _PyObject_ASSERT(op, ascii->wstr != NULL);
482 _PyObject_ASSERT(op, data == NULL);
483 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200484 }
485 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200486 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
487 || kind == PyUnicode_2BYTE_KIND
488 || kind == PyUnicode_4BYTE_KIND);
489 _PyObject_ASSERT(op, ascii->state.compact == 0);
490 _PyObject_ASSERT(op, ascii->state.ready == 1);
491 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200492 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200493 _PyObject_ASSERT(op, compact->utf8 == data);
494 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200495 }
496 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200497 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200498 }
499 }
500 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200501 if (
502#if SIZEOF_WCHAR_T == 2
503 kind == PyUnicode_2BYTE_KIND
504#else
505 kind == PyUnicode_4BYTE_KIND
506#endif
507 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200508 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200509 _PyObject_ASSERT(op, ascii->wstr == data);
510 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200511 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200512 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200513 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200514
515 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200516 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200518 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200519 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520
521 /* check that the best kind is used: O(n) operation */
522 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200523 Py_ssize_t i;
524 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200525 void *data;
526 Py_UCS4 ch;
527
528 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200529 for (i=0; i < ascii->length; i++)
530 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200531 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200532 if (ch > maxchar)
533 maxchar = ch;
534 }
535 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100536 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 _PyObject_ASSERT(op, maxchar >= 128);
538 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100539 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200540 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200541 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200542 }
Victor Stinner77faf692011-11-20 18:56:05 +0100543 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200544 _PyObject_ASSERT(op, maxchar >= 0x100);
545 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100546 }
547 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200548 _PyObject_ASSERT(op, maxchar >= 0x10000);
549 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100550 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200551 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200552 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400553 return 1;
554}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200555
Victor Stinner910337b2011-10-03 03:20:16 +0200556
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100557static PyObject*
558unicode_result_wchar(PyObject *unicode)
559{
560#ifndef Py_DEBUG
561 Py_ssize_t len;
562
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100563 len = _PyUnicode_WSTR_LENGTH(unicode);
564 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200566 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100567 }
568
569 if (len == 1) {
570 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100571 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
573 Py_DECREF(unicode);
574 return latin1_char;
575 }
576 }
577
578 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200579 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100580 return NULL;
581 }
582#else
Victor Stinneraa771272012-10-04 02:32:58 +0200583 assert(Py_REFCNT(unicode) == 1);
584
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100585 /* don't make the result ready in debug mode to ensure that the caller
586 makes the string ready before using it */
587 assert(_PyUnicode_CheckConsistency(unicode, 1));
588#endif
589 return unicode;
590}
591
592static PyObject*
593unicode_result_ready(PyObject *unicode)
594{
595 Py_ssize_t length;
596
597 length = PyUnicode_GET_LENGTH(unicode);
598 if (length == 0) {
599 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100600 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200601 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100602 }
603 return unicode_empty;
604 }
605
606 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200607 void *data = PyUnicode_DATA(unicode);
608 int kind = PyUnicode_KIND(unicode);
609 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100610 if (ch < 256) {
611 PyObject *latin1_char = unicode_latin1[ch];
612 if (latin1_char != NULL) {
613 if (unicode != latin1_char) {
614 Py_INCREF(latin1_char);
615 Py_DECREF(unicode);
616 }
617 return latin1_char;
618 }
619 else {
620 assert(_PyUnicode_CheckConsistency(unicode, 1));
621 Py_INCREF(unicode);
622 unicode_latin1[ch] = unicode;
623 return unicode;
624 }
625 }
626 }
627
628 assert(_PyUnicode_CheckConsistency(unicode, 1));
629 return unicode;
630}
631
632static PyObject*
633unicode_result(PyObject *unicode)
634{
635 assert(_PyUnicode_CHECK(unicode));
636 if (PyUnicode_IS_READY(unicode))
637 return unicode_result_ready(unicode);
638 else
639 return unicode_result_wchar(unicode);
640}
641
Victor Stinnerc4b49542011-12-11 22:44:26 +0100642static PyObject*
643unicode_result_unchanged(PyObject *unicode)
644{
645 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500646 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100647 return NULL;
648 Py_INCREF(unicode);
649 return unicode;
650 }
651 else
652 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100653 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100654}
655
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200656/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 0x100)
678 incr = 2+2;
679 else if (ch < 0x10000)
680 incr = 2+4;
681 else {
682 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200683 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200684 }
685 if (size > PY_SSIZE_T_MAX - incr) {
686 PyErr_SetString(PyExc_OverflowError,
687 "encoded result is too long for a Python string");
688 return NULL;
689 }
690 size += incr;
691 }
692
Victor Stinnerad771582015-10-09 12:38:53 +0200693 str = _PyBytesWriter_Prepare(writer, str, size);
694 if (str == NULL)
695 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696
697 /* generate replacement */
698 for (i = collstart; i < collend; ++i) {
699 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200700 *str++ = '\\';
701 if (ch >= 0x00010000) {
702 *str++ = 'U';
703 *str++ = Py_hexdigits[(ch>>28)&0xf];
704 *str++ = Py_hexdigits[(ch>>24)&0xf];
705 *str++ = Py_hexdigits[(ch>>20)&0xf];
706 *str++ = Py_hexdigits[(ch>>16)&0xf];
707 *str++ = Py_hexdigits[(ch>>12)&0xf];
708 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200709 }
Victor Stinner797485e2015-10-09 03:17:30 +0200710 else if (ch >= 0x100) {
711 *str++ = 'u';
712 *str++ = Py_hexdigits[(ch>>12)&0xf];
713 *str++ = Py_hexdigits[(ch>>8)&0xf];
714 }
715 else
716 *str++ = 'x';
717 *str++ = Py_hexdigits[(ch>>4)&0xf];
718 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 }
720 return str;
721}
722
723/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
724 ASCII, Latin1, UTF-8, etc. */
725static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200726xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200727 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
728{
Victor Stinnerad771582015-10-09 12:38:53 +0200729 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200730 Py_UCS4 ch;
731 enum PyUnicode_Kind kind;
732 void *data;
733
734 assert(PyUnicode_IS_READY(unicode));
735 kind = PyUnicode_KIND(unicode);
736 data = PyUnicode_DATA(unicode);
737
738 size = 0;
739 /* determine replacement size */
740 for (i = collstart; i < collend; ++i) {
741 Py_ssize_t incr;
742
743 ch = PyUnicode_READ(kind, data, i);
744 if (ch < 10)
745 incr = 2+1+1;
746 else if (ch < 100)
747 incr = 2+2+1;
748 else if (ch < 1000)
749 incr = 2+3+1;
750 else if (ch < 10000)
751 incr = 2+4+1;
752 else if (ch < 100000)
753 incr = 2+5+1;
754 else if (ch < 1000000)
755 incr = 2+6+1;
756 else {
757 assert(ch <= MAX_UNICODE);
758 incr = 2+7+1;
759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
775 }
776 return str;
777}
778
Thomas Wouters477c8d52006-05-27 19:21:47 +0000779/* --- Bloom Filters ----------------------------------------------------- */
780
781/* stuff to implement simple "bloom filters" for Unicode characters.
782 to keep things simple, we use a single bitmask, using the least 5
783 bits from each unicode characters as the bit index. */
784
785/* the linebreak mask is set up by Unicode_Init below */
786
Antoine Pitrouf068f942010-01-13 14:19:12 +0000787#if LONG_BIT >= 128
788#define BLOOM_WIDTH 128
789#elif LONG_BIT >= 64
790#define BLOOM_WIDTH 64
791#elif LONG_BIT >= 32
792#define BLOOM_WIDTH 32
793#else
794#error "LONG_BIT is smaller than 32"
795#endif
796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797#define BLOOM_MASK unsigned long
798
Serhiy Storchaka05997252013-01-26 12:14:02 +0200799static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000800
Antoine Pitrouf068f942010-01-13 14:19:12 +0000801#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802
Benjamin Peterson29060642009-01-31 22:14:21 +0000803#define BLOOM_LINEBREAK(ch) \
804 ((ch) < 128U ? ascii_linebreak[(ch)] : \
805 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700807static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809{
Victor Stinnera85af502013-04-09 21:53:54 +0200810#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
811 do { \
812 TYPE *data = (TYPE *)PTR; \
813 TYPE *end = data + LEN; \
814 Py_UCS4 ch; \
815 for (; data != end; data++) { \
816 ch = *data; \
817 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
818 } \
819 break; \
820 } while (0)
821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* calculate simple bloom-style bitmask for a given unicode string */
823
Antoine Pitrouf068f942010-01-13 14:19:12 +0000824 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000825
826 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200827 switch (kind) {
828 case PyUnicode_1BYTE_KIND:
829 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
830 break;
831 case PyUnicode_2BYTE_KIND:
832 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
833 break;
834 case PyUnicode_4BYTE_KIND:
835 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
836 break;
837 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700838 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200839 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000840 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200841
842#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000843}
844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300845static int
846ensure_unicode(PyObject *obj)
847{
848 if (!PyUnicode_Check(obj)) {
849 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200850 "must be str, not %.100s",
851 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300852 return -1;
853 }
854 return PyUnicode_READY(obj);
855}
856
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857/* Compilation of templated routines */
858
859#include "stringlib/asciilib.h"
860#include "stringlib/fastsearch.h"
861#include "stringlib/partition.h"
862#include "stringlib/split.h"
863#include "stringlib/count.h"
864#include "stringlib/find.h"
865#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200866#include "stringlib/undef.h"
867
868#include "stringlib/ucs1lib.h"
869#include "stringlib/fastsearch.h"
870#include "stringlib/partition.h"
871#include "stringlib/split.h"
872#include "stringlib/count.h"
873#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300874#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200875#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200876#include "stringlib/undef.h"
877
878#include "stringlib/ucs2lib.h"
879#include "stringlib/fastsearch.h"
880#include "stringlib/partition.h"
881#include "stringlib/split.h"
882#include "stringlib/count.h"
883#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300884#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200885#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200886#include "stringlib/undef.h"
887
888#include "stringlib/ucs4lib.h"
889#include "stringlib/fastsearch.h"
890#include "stringlib/partition.h"
891#include "stringlib/split.h"
892#include "stringlib/count.h"
893#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300894#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200895#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200896#include "stringlib/undef.h"
897
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200898#include "stringlib/unicodedefs.h"
899#include "stringlib/fastsearch.h"
900#include "stringlib/count.h"
901#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100902#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200903
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904/* --- Unicode Object ----------------------------------------------------- */
905
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700906static inline Py_ssize_t
907findchar(const void *s, int kind,
908 Py_ssize_t size, Py_UCS4 ch,
909 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200911 switch (kind) {
912 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200913 if ((Py_UCS1) ch != ch)
914 return -1;
915 if (direction > 0)
916 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
917 else
918 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200919 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200920 if ((Py_UCS2) ch != ch)
921 return -1;
922 if (direction > 0)
923 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
924 else
925 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200926 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200927 if (direction > 0)
928 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
929 else
930 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700932 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934}
935
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters added by a resize with 0xff bytes so
   that code reading them before they are initialized is detected early.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   `old_length` is the length before the resize; nothing is written when
   the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
954
Victor Stinnerfe226c02011-10-03 03:52:20 +0200955static PyObject*
956resize_compact(PyObject *unicode, Py_ssize_t length)
957{
958 Py_ssize_t char_size;
959 Py_ssize_t struct_size;
960 Py_ssize_t new_size;
961 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100962 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200963#ifdef Py_DEBUG
964 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
965#endif
966
Victor Stinner79891572012-05-03 13:43:07 +0200967 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100969 assert(PyUnicode_IS_COMPACT(unicode));
970
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200971 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100972 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 struct_size = sizeof(PyASCIIObject);
974 else
975 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200976 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
979 PyErr_NoMemory();
980 return NULL;
981 }
982 new_size = (struct_size + (length + 1) * char_size);
983
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200984 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
985 PyObject_DEL(_PyUnicode_UTF8(unicode));
986 _PyUnicode_UTF8(unicode) = NULL;
987 _PyUnicode_UTF8_LENGTH(unicode) = 0;
988 }
Victor Stinner84def372011-12-11 20:04:56 +0100989 _Py_DEC_REFTOTAL;
990 _Py_ForgetReference(unicode);
991
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300992 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100993 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100994 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 PyErr_NoMemory();
996 return NULL;
997 }
Victor Stinner84def372011-12-11 20:04:56 +0100998 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001000
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001004 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001005 _PyUnicode_WSTR_LENGTH(unicode) = length;
1006 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001007 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1008 PyObject_DEL(_PyUnicode_WSTR(unicode));
1009 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001010 if (!PyUnicode_IS_ASCII(unicode))
1011 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001012 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001016 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1017 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 return unicode;
1020}
1021
Alexander Belopolsky40018472011-02-26 01:02:56 +00001022static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001023resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024{
Victor Stinner95663112011-10-04 01:03:50 +02001025 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001026 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 if (PyUnicode_IS_READY(unicode)) {
1031 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001032 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1036#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037
1038 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001039 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001040 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1041 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042
1043 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1044 PyErr_NoMemory();
1045 return -1;
1046 }
1047 new_size = (length + 1) * char_size;
1048
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1050 {
1051 PyObject_DEL(_PyUnicode_UTF8(unicode));
1052 _PyUnicode_UTF8(unicode) = NULL;
1053 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1054 }
1055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 data = (PyObject *)PyObject_REALLOC(data, new_size);
1057 if (data == NULL) {
1058 PyErr_NoMemory();
1059 return -1;
1060 }
1061 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001062 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 _PyUnicode_WSTR_LENGTH(unicode) = length;
1065 }
1066 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001067 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 _PyUnicode_UTF8_LENGTH(unicode) = length;
1069 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _PyUnicode_LENGTH(unicode) = length;
1071 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001072#ifdef Py_DEBUG
1073 unicode_fill_invalid(unicode, old_length);
1074#endif
Victor Stinner95663112011-10-04 01:03:50 +02001075 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001076 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 }
Victor Stinner95663112011-10-04 01:03:50 +02001080 assert(_PyUnicode_WSTR(unicode) != NULL);
1081
1082 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001083 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001084 PyErr_NoMemory();
1085 return -1;
1086 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001087 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001088 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001090 if (!wstr) {
1091 PyErr_NoMemory();
1092 return -1;
1093 }
1094 _PyUnicode_WSTR(unicode) = wstr;
1095 _PyUnicode_WSTR(unicode)[length] = 0;
1096 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 return 0;
1099}
1100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101static PyObject*
1102resize_copy(PyObject *unicode, Py_ssize_t length)
1103{
1104 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001106 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001107
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001108 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109
1110 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1111 if (copy == NULL)
1112 return NULL;
1113
1114 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001115 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001117 }
1118 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001119 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 if (w == NULL)
1123 return NULL;
1124 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1125 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001126 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001127 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001128 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
1130}
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001133 Ux0000 terminated; some code (e.g. new_identifier)
1134 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135
1136 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001137 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139*/
1140
Alexander Belopolsky40018472011-02-26 01:02:56 +00001141static PyUnicodeObject *
1142_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001144 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
Thomas Wouters477c8d52006-05-27 19:21:47 +00001147 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 if (length == 0 && unicode_empty != NULL) {
1149 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001150 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 }
1152
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001153 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001155 return (PyUnicodeObject *)PyErr_NoMemory();
1156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 if (length < 0) {
1158 PyErr_SetString(PyExc_SystemError,
1159 "Negative size passed to _PyUnicode_New");
1160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 }
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1164 if (unicode == NULL)
1165 return NULL;
1166 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001167
1168 _PyUnicode_WSTR_LENGTH(unicode) = length;
1169 _PyUnicode_HASH(unicode) = -1;
1170 _PyUnicode_STATE(unicode).interned = 0;
1171 _PyUnicode_STATE(unicode).kind = 0;
1172 _PyUnicode_STATE(unicode).compact = 0;
1173 _PyUnicode_STATE(unicode).ready = 0;
1174 _PyUnicode_STATE(unicode).ascii = 0;
1175 _PyUnicode_DATA_ANY(unicode) = NULL;
1176 _PyUnicode_LENGTH(unicode) = 0;
1177 _PyUnicode_UTF8(unicode) = NULL;
1178 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1181 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001182 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001183 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001184 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186
Jeremy Hyltond8082792003-09-16 19:41:39 +00001187 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001188 * the caller fails before initializing str -- unicode_resize()
1189 * reads str[0], and the Keep-Alive optimization can keep memory
1190 * allocated for str alive across a call to unicode_dealloc(unicode).
1191 * We don't want unicode_resize to read uninitialized memory in
1192 * that case.
1193 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 _PyUnicode_WSTR(unicode)[0] = 0;
1195 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001196
Victor Stinner7931d9a2011-11-04 00:22:48 +01001197 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 return unicode;
1199}
1200
Victor Stinnerf42dc442011-10-02 23:33:16 +02001201static const char*
1202unicode_kind_name(PyObject *unicode)
1203{
Victor Stinner42dfd712011-10-03 14:41:45 +02001204 /* don't check consistency: unicode_kind_name() is called from
1205 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 if (!PyUnicode_IS_COMPACT(unicode))
1207 {
1208 if (!PyUnicode_IS_READY(unicode))
1209 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001210 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001211 {
1212 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001213 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 return "legacy ascii";
1215 else
1216 return "legacy latin1";
1217 case PyUnicode_2BYTE_KIND:
1218 return "legacy UCS2";
1219 case PyUnicode_4BYTE_KIND:
1220 return "legacy UCS4";
1221 default:
1222 return "<legacy invalid kind>";
1223 }
1224 }
1225 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001226 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001227 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001228 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 return "ascii";
1230 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001231 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001232 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001233 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001234 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001235 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001236 default:
1237 return "<invalid compact kind>";
1238 }
1239}
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001243char *_PyUnicode_utf8(void *unicode_raw){
1244 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001245 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246}
1247
Victor Stinnera42de742018-11-22 10:25:22 +01001248void *_PyUnicode_compact_data(void *unicode_raw) {
1249 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 return _PyUnicode_COMPACT_DATA(unicode);
1251}
Victor Stinnera42de742018-11-22 10:25:22 +01001252void *_PyUnicode_data(void *unicode_raw) {
1253 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001254 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1256 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1257 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1258 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1259 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1260 return PyUnicode_DATA(unicode);
1261}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001262
1263void
1264_PyUnicode_Dump(PyObject *op)
1265{
1266 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001267 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1268 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1269 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001270
Victor Stinnera849a4b2011-10-03 12:12:11 +02001271 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001272 {
1273 if (ascii->state.ascii)
1274 data = (ascii + 1);
1275 else
1276 data = (compact + 1);
1277 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001278 else
1279 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001280 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1281 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001282
Victor Stinnera849a4b2011-10-03 12:12:11 +02001283 if (ascii->wstr == data)
1284 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001285 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001286
Victor Stinnera3b334d2011-10-03 13:53:37 +02001287 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001288 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001289 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1290 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001291 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001292 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001293 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001294 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001295}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#endif
1297
1298PyObject *
1299PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1300{
1301 PyObject *obj;
1302 PyCompactUnicodeObject *unicode;
1303 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001304 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001305 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 Py_ssize_t char_size;
1307 Py_ssize_t struct_size;
1308
1309 /* Optimization for empty strings */
1310 if (size == 0 && unicode_empty != NULL) {
1311 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001312 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 }
1314
Victor Stinner9e9d6892011-10-04 01:02:02 +02001315 is_ascii = 0;
1316 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 struct_size = sizeof(PyCompactUnicodeObject);
1318 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001319 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 char_size = 1;
1321 is_ascii = 1;
1322 struct_size = sizeof(PyASCIIObject);
1323 }
1324 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001325 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 char_size = 1;
1327 }
1328 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001329 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 char_size = 2;
1331 if (sizeof(wchar_t) == 2)
1332 is_sharing = 1;
1333 }
1334 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001335 if (maxchar > MAX_UNICODE) {
1336 PyErr_SetString(PyExc_SystemError,
1337 "invalid maximum character passed to PyUnicode_New");
1338 return NULL;
1339 }
Victor Stinner8f825062012-04-27 13:55:39 +02001340 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 char_size = 4;
1342 if (sizeof(wchar_t) == 4)
1343 is_sharing = 1;
1344 }
1345
1346 /* Ensure we won't overflow the size. */
1347 if (size < 0) {
1348 PyErr_SetString(PyExc_SystemError,
1349 "Negative size passed to PyUnicode_New");
1350 return NULL;
1351 }
1352 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1353 return PyErr_NoMemory();
1354
1355 /* Duplicated allocation code from _PyObject_New() instead of a call to
1356 * PyObject_New() so we are able to allocate space for the object and
1357 * it's data buffer.
1358 */
1359 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1360 if (obj == NULL)
1361 return PyErr_NoMemory();
1362 obj = PyObject_INIT(obj, &PyUnicode_Type);
1363 if (obj == NULL)
1364 return NULL;
1365
1366 unicode = (PyCompactUnicodeObject *)obj;
1367 if (is_ascii)
1368 data = ((PyASCIIObject*)obj) + 1;
1369 else
1370 data = unicode + 1;
1371 _PyUnicode_LENGTH(unicode) = size;
1372 _PyUnicode_HASH(unicode) = -1;
1373 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001374 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 _PyUnicode_STATE(unicode).compact = 1;
1376 _PyUnicode_STATE(unicode).ready = 1;
1377 _PyUnicode_STATE(unicode).ascii = is_ascii;
1378 if (is_ascii) {
1379 ((char*)data)[size] = 0;
1380 _PyUnicode_WSTR(unicode) = NULL;
1381 }
Victor Stinner8f825062012-04-27 13:55:39 +02001382 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 ((char*)data)[size] = 0;
1384 _PyUnicode_WSTR(unicode) = NULL;
1385 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001387 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 else {
1390 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001391 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001392 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 ((Py_UCS4*)data)[size] = 0;
1396 if (is_sharing) {
1397 _PyUnicode_WSTR_LENGTH(unicode) = size;
1398 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1399 }
1400 else {
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 }
1404 }
Victor Stinner8f825062012-04-27 13:55:39 +02001405#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001406 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001407#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001408 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 return obj;
1410}
1411
#if SIZEOF_WCHAR_T == 2
/* Convert the 16-bit wchar_t sequence [begin, end) to UCS4, storing the
   result in the 4-byte data buffer of `unicode`.  A high surrogate directly
   followed by a low surrogate is joined into one code point; any other unit
   (lone surrogates included) is copied unchanged.  The other conversions are
   implemented as macros for efficiency.

   The caller must have sized `unicode` so that it holds exactly the number
   of resulting code points, plus room for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* Valid pair: two input units yield one output code point. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1451
Victor Stinnercd9950f2011-10-02 00:34:53 +02001452static int
Victor Stinner488fa492011-12-12 00:01:39 +01001453unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001454{
Victor Stinner488fa492011-12-12 00:01:39 +01001455 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001456 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001457 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001458 return -1;
1459 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460 return 0;
1461}
1462
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001463static int
1464_copy_characters(PyObject *to, Py_ssize_t to_start,
1465 PyObject *from, Py_ssize_t from_start,
1466 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001468 unsigned int from_kind, to_kind;
1469 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Victor Stinneree4544c2012-05-09 22:24:08 +02001471 assert(0 <= how_many);
1472 assert(0 <= from_start);
1473 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001474 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001475 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001476 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477
Victor Stinnerd3f08822012-05-29 12:57:52 +02001478 assert(PyUnicode_Check(to));
1479 assert(PyUnicode_IS_READY(to));
1480 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1481
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001482 if (how_many == 0)
1483 return 0;
1484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489
Victor Stinnerf1852262012-06-16 16:38:26 +02001490#ifdef Py_DEBUG
1491 if (!check_maxchar
1492 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1493 {
1494 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1495 Py_UCS4 ch;
1496 Py_ssize_t i;
1497 for (i=0; i < how_many; i++) {
1498 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1499 assert(ch <= to_maxchar);
1500 }
1501 }
1502#endif
1503
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001504 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001505 if (check_maxchar
1506 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1507 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001508 /* Writing Latin-1 characters into an ASCII string requires to
1509 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 Py_UCS4 max_char;
1511 max_char = ucs1lib_find_max_char(from_data,
1512 (Py_UCS1*)from_data + how_many);
1513 if (max_char >= 128)
1514 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 }
Christian Heimesf051e432016-09-13 20:22:02 +02001516 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001517 (char*)from_data + from_kind * from_start,
1518 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 else if (from_kind == PyUnicode_1BYTE_KIND
1521 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001522 {
1523 _PyUnicode_CONVERT_BYTES(
1524 Py_UCS1, Py_UCS2,
1525 PyUnicode_1BYTE_DATA(from) + from_start,
1526 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1527 PyUnicode_2BYTE_DATA(to) + to_start
1528 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001529 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001530 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001531 && to_kind == PyUnicode_4BYTE_KIND)
1532 {
1533 _PyUnicode_CONVERT_BYTES(
1534 Py_UCS1, Py_UCS4,
1535 PyUnicode_1BYTE_DATA(from) + from_start,
1536 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1537 PyUnicode_4BYTE_DATA(to) + to_start
1538 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001539 }
1540 else if (from_kind == PyUnicode_2BYTE_KIND
1541 && to_kind == PyUnicode_4BYTE_KIND)
1542 {
1543 _PyUnicode_CONVERT_BYTES(
1544 Py_UCS2, Py_UCS4,
1545 PyUnicode_2BYTE_DATA(from) + from_start,
1546 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1547 PyUnicode_4BYTE_DATA(to) + to_start
1548 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001549 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001551 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1552
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001553 if (!check_maxchar) {
1554 if (from_kind == PyUnicode_2BYTE_KIND
1555 && to_kind == PyUnicode_1BYTE_KIND)
1556 {
1557 _PyUnicode_CONVERT_BYTES(
1558 Py_UCS2, Py_UCS1,
1559 PyUnicode_2BYTE_DATA(from) + from_start,
1560 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1561 PyUnicode_1BYTE_DATA(to) + to_start
1562 );
1563 }
1564 else if (from_kind == PyUnicode_4BYTE_KIND
1565 && to_kind == PyUnicode_1BYTE_KIND)
1566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS4, Py_UCS1,
1569 PyUnicode_4BYTE_DATA(from) + from_start,
1570 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_1BYTE_DATA(to) + to_start
1572 );
1573 }
1574 else if (from_kind == PyUnicode_4BYTE_KIND
1575 && to_kind == PyUnicode_2BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS4, Py_UCS2,
1579 PyUnicode_4BYTE_DATA(from) + from_start,
1580 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_2BYTE_DATA(to) + to_start
1582 );
1583 }
1584 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001585 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001586 }
1587 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001589 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 Py_ssize_t i;
1592
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 if (ch > to_maxchar)
1596 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001597 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1598 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 }
1600 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001601 return 0;
1602}
1603
Victor Stinnerd3f08822012-05-29 12:57:52 +02001604void
1605_PyUnicode_FastCopyCharacters(
1606 PyObject *to, Py_ssize_t to_start,
1607 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608{
1609 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1610}
1611
1612Py_ssize_t
1613PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1614 PyObject *from, Py_ssize_t from_start,
1615 Py_ssize_t how_many)
1616{
1617 int err;
1618
1619 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1620 PyErr_BadInternalCall();
1621 return -1;
1622 }
1623
Benjamin Petersonbac79492012-01-14 13:34:47 -05001624 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001625 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001626 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001627 return -1;
1628
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001629 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001630 PyErr_SetString(PyExc_IndexError, "string index out of range");
1631 return -1;
1632 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001633 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001634 PyErr_SetString(PyExc_IndexError, "string index out of range");
1635 return -1;
1636 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001637 if (how_many < 0) {
1638 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1639 return -1;
1640 }
1641 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001642 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1643 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001644 "Cannot write %zi characters at %zi "
1645 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001646 how_many, to_start, PyUnicode_GET_LENGTH(to));
1647 return -1;
1648 }
1649
1650 if (how_many == 0)
1651 return 0;
1652
Victor Stinner488fa492011-12-12 00:01:39 +01001653 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001654 return -1;
1655
1656 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1657 if (err) {
1658 PyErr_Format(PyExc_SystemError,
1659 "Cannot copy %s characters "
1660 "into a string of %s characters",
1661 unicode_kind_name(from),
1662 unicode_kind_name(to));
1663 return -1;
1664 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001665 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666}
1667
Victor Stinner17222162011-09-28 22:15:37 +02001668/* Find the maximum code point and count the number of surrogate pairs so a
1669 correct string length can be computed before converting a string to UCS4.
1670 This function counts single surrogates as a character and not as a pair.
1671
1672 Return 0 on success, or -1 on error. */
1673static int
1674find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1675 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676{
1677 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679
Victor Stinnerc53be962011-10-02 21:33:54 +02001680 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 *num_surrogates = 0;
1682 *maxchar = 0;
1683
1684 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001686 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1687 && (iter+1) < end
1688 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1689 {
1690 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1691 ++(*num_surrogates);
1692 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001696 {
1697 ch = *iter;
1698 iter++;
1699 }
1700 if (ch > *maxchar) {
1701 *maxchar = ch;
1702 if (*maxchar > MAX_UNICODE) {
1703 PyErr_Format(PyExc_ValueError,
1704 "character U+%x is not in range [U+0000; U+10ffff]",
1705 ch);
1706 return -1;
1707 }
1708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
1710 return 0;
1711}
1712
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001713int
1714_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715{
1716 wchar_t *end;
1717 Py_UCS4 maxchar = 0;
1718 Py_ssize_t num_surrogates;
1719#if SIZEOF_WCHAR_T == 2
1720 Py_ssize_t length_wo_surrogates;
1721#endif
1722
Georg Brandl7597add2011-10-05 16:36:47 +02001723 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001724 strings were created using _PyObject_New() and where no canonical
1725 representation (the str field) has been set yet aka strings
1726 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001727 assert(_PyUnicode_CHECK(unicode));
1728 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001731 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 /* Actually, it should neither be interned nor be anything else: */
1733 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001736 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001737 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739
1740 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001741 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1742 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 PyErr_NoMemory();
1744 return -1;
1745 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001746 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 _PyUnicode_WSTR(unicode), end,
1748 PyUnicode_1BYTE_DATA(unicode));
1749 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1750 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1751 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1752 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001753 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001754 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001755 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 }
1757 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001758 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001759 _PyUnicode_UTF8(unicode) = NULL;
1760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 }
1762 PyObject_FREE(_PyUnicode_WSTR(unicode));
1763 _PyUnicode_WSTR(unicode) = NULL;
1764 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1765 }
1766 /* In this case we might have to convert down from 4-byte native
1767 wchar_t to 2-byte unicode. */
1768 else if (maxchar < 65536) {
1769 assert(num_surrogates == 0 &&
1770 "FindMaxCharAndNumSurrogatePairs() messed up");
1771
Victor Stinner506f5922011-09-28 22:34:18 +02001772#if SIZEOF_WCHAR_T == 2
1773 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001775 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1776 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1777 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001778 _PyUnicode_UTF8(unicode) = NULL;
1779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001780#else
1781 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001782 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001783 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001785 PyErr_NoMemory();
1786 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 }
Victor Stinner506f5922011-09-28 22:34:18 +02001788 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1789 _PyUnicode_WSTR(unicode), end,
1790 PyUnicode_2BYTE_DATA(unicode));
1791 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1793 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001796 PyObject_FREE(_PyUnicode_WSTR(unicode));
1797 _PyUnicode_WSTR(unicode) = NULL;
1798 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1799#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1802 else {
1803#if SIZEOF_WCHAR_T == 2
1804 /* in case the native representation is 2-bytes, we need to allocate a
1805 new normalized 4-byte version. */
1806 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001807 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1808 PyErr_NoMemory();
1809 return -1;
1810 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001811 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1812 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 PyErr_NoMemory();
1814 return -1;
1815 }
1816 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1817 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001818 _PyUnicode_UTF8(unicode) = NULL;
1819 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001820 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1821 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001822 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyObject_FREE(_PyUnicode_WSTR(unicode));
1824 _PyUnicode_WSTR(unicode) = NULL;
1825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1826#else
1827 assert(num_surrogates == 0);
1828
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 _PyUnicode_UTF8(unicode) = NULL;
1832 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1834#endif
1835 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1836 }
1837 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001838 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return 0;
1840}
1841
Alexander Belopolsky40018472011-02-26 01:02:56 +00001842static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001843unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844{
Walter Dörwald16807132007-05-25 13:52:07 +00001845 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 case SSTATE_NOT_INTERNED:
1847 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001848
Benjamin Peterson29060642009-01-31 22:14:21 +00001849 case SSTATE_INTERNED_MORTAL:
1850 /* revive dead object temporarily for DelItem */
1851 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001852 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 Py_FatalError(
1854 "deletion of interned string failed");
1855 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001856
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 case SSTATE_INTERNED_IMMORTAL:
1858 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001859 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001860
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 default:
1862 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001863 }
1864
Victor Stinner03490912011-10-03 23:45:12 +02001865 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001867 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001868 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1870 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001872 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873}
1874
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons (the
   empty string, or the cached one-character string for a code point below
   256), and 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only ready strings of length 1 can be in the Latin-1 cache. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    if (ch < 256 && unicode_latin1[ch] == unicode)
        return 1;
    return 0;
}
#endif
1891
Alexander Belopolsky40018472011-02-26 01:02:56 +00001892static int
Victor Stinner488fa492011-12-12 00:01:39 +01001893unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894{
Victor Stinner488fa492011-12-12 00:01:39 +01001895 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001896 if (Py_REFCNT(unicode) != 1)
1897 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001898 if (_PyUnicode_HASH(unicode) != -1)
1899 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001900 if (PyUnicode_CHECK_INTERNED(unicode))
1901 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!PyUnicode_CheckExact(unicode))
1903 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001904#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001905 /* singleton refcount is greater than 1 */
1906 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001907#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001908 return 1;
1909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Victor Stinnerfe226c02011-10-03 03:52:20 +02001911static int
1912unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1913{
1914 PyObject *unicode;
1915 Py_ssize_t old_length;
1916
1917 assert(p_unicode != NULL);
1918 unicode = *p_unicode;
1919
1920 assert(unicode != NULL);
1921 assert(PyUnicode_Check(unicode));
1922 assert(0 <= length);
1923
Victor Stinner910337b2011-10-03 03:20:16 +02001924 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001925 old_length = PyUnicode_WSTR_LENGTH(unicode);
1926 else
1927 old_length = PyUnicode_GET_LENGTH(unicode);
1928 if (old_length == length)
1929 return 0;
1930
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001931 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001932 _Py_INCREF_UNICODE_EMPTY();
1933 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001935 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001936 return 0;
1937 }
1938
Victor Stinner488fa492011-12-12 00:01:39 +01001939 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 PyObject *copy = resize_copy(unicode, length);
1941 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001943 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001945 }
1946
Victor Stinnerfe226c02011-10-03 03:52:20 +02001947 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001948 PyObject *new_unicode = resize_compact(unicode, length);
1949 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001950 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001951 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001953 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001954 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001955}
1956
Alexander Belopolsky40018472011-02-26 01:02:56 +00001957int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001958PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001959{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001960 PyObject *unicode;
1961 if (p_unicode == NULL) {
1962 PyErr_BadInternalCall();
1963 return -1;
1964 }
1965 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001966 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 {
1968 PyErr_BadInternalCall();
1969 return -1;
1970 }
1971 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001972}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001973
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001974/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001975
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001976 WARNING: The function doesn't copy the terminating null character and
1977 doesn't check the maximum character (may write a latin1 character in an
1978 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001979static void
1980unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1981 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982{
1983 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1984 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001985 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001986
1987 switch (kind) {
1988 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001989 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001990#ifdef Py_DEBUG
1991 if (PyUnicode_IS_ASCII(unicode)) {
1992 Py_UCS4 maxchar = ucs1lib_find_max_char(
1993 (const Py_UCS1*)str,
1994 (const Py_UCS1*)str + len);
1995 assert(maxchar < 128);
1996 }
1997#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001998 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001999 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002000 }
2001 case PyUnicode_2BYTE_KIND: {
2002 Py_UCS2 *start = (Py_UCS2 *)data + index;
2003 Py_UCS2 *ucs2 = start;
2004 assert(index <= PyUnicode_GET_LENGTH(unicode));
2005
Victor Stinner184252a2012-06-16 02:57:41 +02002006 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002007 *ucs2 = (Py_UCS2)*str;
2008
2009 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002010 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002011 }
2012 default: {
2013 Py_UCS4 *start = (Py_UCS4 *)data + index;
2014 Py_UCS4 *ucs4 = start;
2015 assert(kind == PyUnicode_4BYTE_KIND);
2016 assert(index <= PyUnicode_GET_LENGTH(unicode));
2017
Victor Stinner184252a2012-06-16 02:57:41 +02002018 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002019 *ucs4 = (Py_UCS4)*str;
2020
2021 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002022 }
2023 }
2024}
2025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026static PyObject*
2027get_latin1_char(unsigned char ch)
2028{
Victor Stinnera464fc12011-10-02 20:39:30 +02002029 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002031 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!unicode)
2033 return NULL;
2034 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 unicode_latin1[ch] = unicode;
2037 }
2038 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002039 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040}
2041
Victor Stinner985a82a2014-01-03 12:53:47 +01002042static PyObject*
2043unicode_char(Py_UCS4 ch)
2044{
2045 PyObject *unicode;
2046
2047 assert(ch <= MAX_UNICODE);
2048
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002049 if (ch < 256)
2050 return get_latin1_char(ch);
2051
Victor Stinner985a82a2014-01-03 12:53:47 +01002052 unicode = PyUnicode_New(1, ch);
2053 if (unicode == NULL)
2054 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002055
2056 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2057 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002058 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002059 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002060 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2061 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2062 }
2063 assert(_PyUnicode_CheckConsistency(unicode, 1));
2064 return unicode;
2065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067PyObject *
2068PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002070 if (u == NULL)
2071 return (PyObject*)_PyUnicode_New(size);
2072
2073 if (size < 0) {
2074 PyErr_BadInternalCall();
2075 return NULL;
2076 }
2077
2078 return PyUnicode_FromWideChar(u, size);
2079}
2080
2081PyObject *
2082PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2083{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002084 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 Py_UCS4 maxchar = 0;
2086 Py_ssize_t num_surrogates;
2087
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002088 if (u == NULL && size != 0) {
2089 PyErr_BadInternalCall();
2090 return NULL;
2091 }
2092
2093 if (size == -1) {
2094 size = wcslen(u);
2095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002097 /* If the Unicode data is known at construction time, we can apply
2098 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002101 if (size == 0)
2102 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 /* Single character Unicode objects in the Latin-1 range are
2105 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002106 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 return get_latin1_char((unsigned char)*u);
2108
2109 /* If not empty and not single character, copy the Unicode data
2110 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002111 if (find_maxchar_surrogates(u, u + size,
2112 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return NULL;
2114
Victor Stinner8faf8212011-12-08 22:14:11 +01002115 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 if (!unicode)
2117 return NULL;
2118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 switch (PyUnicode_KIND(unicode)) {
2120 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002121 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2123 break;
2124 case PyUnicode_2BYTE_KIND:
2125#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002126 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002128 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2130#endif
2131 break;
2132 case PyUnicode_4BYTE_KIND:
2133#if SIZEOF_WCHAR_T == 2
2134 /* This is the only case which has to process surrogates, thus
2135 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002136 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137#else
2138 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002139 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140#endif
2141 break;
2142 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002143 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002146 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147}
2148
Alexander Belopolsky40018472011-02-26 01:02:56 +00002149PyObject *
2150PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002151{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 if (size < 0) {
2153 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002154 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 return NULL;
2156 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002157 if (u != NULL)
2158 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2159 else
2160 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002161}
2162
Alexander Belopolsky40018472011-02-26 01:02:56 +00002163PyObject *
2164PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002165{
2166 size_t size = strlen(u);
2167 if (size > PY_SSIZE_T_MAX) {
2168 PyErr_SetString(PyExc_OverflowError, "input too long");
2169 return NULL;
2170 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002171 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002172}
2173
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002174PyObject *
2175_PyUnicode_FromId(_Py_Identifier *id)
2176{
2177 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002178 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2179 strlen(id->string),
2180 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002181 if (!id->object)
2182 return NULL;
2183 PyUnicode_InternInPlace(&id->object);
2184 assert(!id->next);
2185 id->next = static_strings;
2186 static_strings = id;
2187 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002188 return id->object;
2189}
2190
2191void
2192_PyUnicode_ClearStaticStrings()
2193{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002194 _Py_Identifier *tmp, *s = static_strings;
2195 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002196 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002197 tmp = s->next;
2198 s->next = NULL;
2199 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002200 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002201 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002202}
2203
Benjamin Peterson0df54292012-03-26 14:50:32 -04002204/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205
Victor Stinnerd3f08822012-05-29 12:57:52 +02002206PyObject*
2207_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002208{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002209 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002210 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002211 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002212#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002213 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002214#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002215 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002216 }
Victor Stinner785938e2011-12-11 20:09:03 +01002217 unicode = PyUnicode_New(size, 127);
2218 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002219 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002220 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2221 assert(_PyUnicode_CheckConsistency(unicode, 1));
2222 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002223}
2224
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002225static Py_UCS4
2226kind_maxchar_limit(unsigned int kind)
2227{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002228 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002229 case PyUnicode_1BYTE_KIND:
2230 return 0x80;
2231 case PyUnicode_2BYTE_KIND:
2232 return 0x100;
2233 case PyUnicode_4BYTE_KIND:
2234 return 0x10000;
2235 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002236 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002237 }
2238}
2239
Victor Stinner702c7342011-10-05 13:50:52 +02002240static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002241_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002244 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245
Serhiy Storchaka678db842013-01-26 12:16:36 +02002246 if (size == 0)
2247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002249 if (size == 1)
2250 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002252 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002253 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!res)
2255 return NULL;
2256 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002257 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002259}
2260
Victor Stinnere57b1c02011-09-28 22:20:48 +02002261static PyObject*
2262_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263{
2264 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002265 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002266
Serhiy Storchaka678db842013-01-26 12:16:36 +02002267 if (size == 0)
2268 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002269 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002270 if (size == 1)
2271 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002273 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002274 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 if (!res)
2276 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002277 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002279 else {
2280 _PyUnicode_CONVERT_BYTES(
2281 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2282 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002283 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 return res;
2285}
2286
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287static PyObject*
2288_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289{
2290 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002291 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292
Serhiy Storchaka678db842013-01-26 12:16:36 +02002293 if (size == 0)
2294 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002296 if (size == 1)
2297 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002298
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002300 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 if (!res)
2302 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002303 if (max_char < 256)
2304 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2305 PyUnicode_1BYTE_DATA(res));
2306 else if (max_char < 0x10000)
2307 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2308 PyUnicode_2BYTE_DATA(res));
2309 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002311 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 return res;
2313}
2314
2315PyObject*
2316PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2317{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002318 if (size < 0) {
2319 PyErr_SetString(PyExc_ValueError, "size must be positive");
2320 return NULL;
2321 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002322 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002324 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002329 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002330 PyErr_SetString(PyExc_SystemError, "invalid kind");
2331 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333}
2334
Victor Stinnerece58de2012-04-23 23:36:38 +02002335Py_UCS4
2336_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2337{
2338 enum PyUnicode_Kind kind;
2339 void *startptr, *endptr;
2340
2341 assert(PyUnicode_IS_READY(unicode));
2342 assert(0 <= start);
2343 assert(end <= PyUnicode_GET_LENGTH(unicode));
2344 assert(start <= end);
2345
2346 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2347 return PyUnicode_MAX_CHAR_VALUE(unicode);
2348
2349 if (start == end)
2350 return 127;
2351
Victor Stinner94d558b2012-04-27 22:26:58 +02002352 if (PyUnicode_IS_ASCII(unicode))
2353 return 127;
2354
Victor Stinnerece58de2012-04-23 23:36:38 +02002355 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002356 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002357 endptr = (char *)startptr + end * kind;
2358 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002359 switch(kind) {
2360 case PyUnicode_1BYTE_KIND:
2361 return ucs1lib_find_max_char(startptr, endptr);
2362 case PyUnicode_2BYTE_KIND:
2363 return ucs2lib_find_max_char(startptr, endptr);
2364 case PyUnicode_4BYTE_KIND:
2365 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002366 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002367 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002368 }
2369}
2370
Victor Stinner25a4b292011-10-06 12:31:55 +02002371/* Ensure that a string uses the most efficient storage, if it is not the
2372 case: create a new string with of the right kind. Write NULL into *p_unicode
2373 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002374static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002375unicode_adjust_maxchar(PyObject **p_unicode)
2376{
2377 PyObject *unicode, *copy;
2378 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002379 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002380 unsigned int kind;
2381
2382 assert(p_unicode != NULL);
2383 unicode = *p_unicode;
2384 assert(PyUnicode_IS_READY(unicode));
2385 if (PyUnicode_IS_ASCII(unicode))
2386 return;
2387
2388 len = PyUnicode_GET_LENGTH(unicode);
2389 kind = PyUnicode_KIND(unicode);
2390 if (kind == PyUnicode_1BYTE_KIND) {
2391 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002392 max_char = ucs1lib_find_max_char(u, u + len);
2393 if (max_char >= 128)
2394 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002395 }
2396 else if (kind == PyUnicode_2BYTE_KIND) {
2397 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002398 max_char = ucs2lib_find_max_char(u, u + len);
2399 if (max_char >= 256)
2400 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002401 }
2402 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002403 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002404 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 max_char = ucs4lib_find_max_char(u, u + len);
2406 if (max_char >= 0x10000)
2407 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002408 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002409 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002410 if (copy != NULL)
2411 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002412 Py_DECREF(unicode);
2413 *p_unicode = copy;
2414}
2415
Victor Stinner034f6cf2011-09-30 02:26:44 +02002416PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002417_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002418{
Victor Stinner87af4f22011-11-21 23:03:47 +01002419 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002420 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002421
Victor Stinner034f6cf2011-09-30 02:26:44 +02002422 if (!PyUnicode_Check(unicode)) {
2423 PyErr_BadInternalCall();
2424 return NULL;
2425 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002426 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002427 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002428
Victor Stinner87af4f22011-11-21 23:03:47 +01002429 length = PyUnicode_GET_LENGTH(unicode);
2430 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002431 if (!copy)
2432 return NULL;
2433 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2434
Christian Heimesf051e432016-09-13 20:22:02 +02002435 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002436 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002437 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002438 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002439}
2440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441
Victor Stinnerbc603d12011-10-02 01:00:40 +02002442/* Widen Unicode objects to larger buffers. Don't write terminating null
2443 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444
2445void*
2446_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2447{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002448 Py_ssize_t len;
2449 void *result;
2450 unsigned int skind;
2451
Benjamin Petersonbac79492012-01-14 13:34:47 -05002452 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002453 return NULL;
2454
2455 len = PyUnicode_GET_LENGTH(s);
2456 skind = PyUnicode_KIND(s);
2457 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002461 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002462 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002463 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002464 if (!result)
2465 return PyErr_NoMemory();
2466 assert(skind == PyUnicode_1BYTE_KIND);
2467 _PyUnicode_CONVERT_BYTES(
2468 Py_UCS1, Py_UCS2,
2469 PyUnicode_1BYTE_DATA(s),
2470 PyUnicode_1BYTE_DATA(s) + len,
2471 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002473 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002474 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002475 if (!result)
2476 return PyErr_NoMemory();
2477 if (skind == PyUnicode_2BYTE_KIND) {
2478 _PyUnicode_CONVERT_BYTES(
2479 Py_UCS2, Py_UCS4,
2480 PyUnicode_2BYTE_DATA(s),
2481 PyUnicode_2BYTE_DATA(s) + len,
2482 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002484 else {
2485 assert(skind == PyUnicode_1BYTE_KIND);
2486 _PyUnicode_CONVERT_BYTES(
2487 Py_UCS1, Py_UCS4,
2488 PyUnicode_1BYTE_DATA(s),
2489 PyUnicode_1BYTE_DATA(s) + len,
2490 result);
2491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002493 default:
2494 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 }
Victor Stinner01698042011-10-04 00:04:26 +02002496 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return NULL;
2498}
2499
2500static Py_UCS4*
2501as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
2504 int kind;
2505 void *data;
2506 Py_ssize_t len, targetlen;
2507 if (PyUnicode_READY(string) == -1)
2508 return NULL;
2509 kind = PyUnicode_KIND(string);
2510 data = PyUnicode_DATA(string);
2511 len = PyUnicode_GET_LENGTH(string);
2512 targetlen = len;
2513 if (copy_null)
2514 targetlen++;
2515 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002516 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 if (!target) {
2518 PyErr_NoMemory();
2519 return NULL;
2520 }
2521 }
2522 else {
2523 if (targetsize < targetlen) {
2524 PyErr_Format(PyExc_SystemError,
2525 "string is longer than the buffer");
2526 if (copy_null && 0 < targetsize)
2527 target[0] = 0;
2528 return NULL;
2529 }
2530 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002531 if (kind == PyUnicode_1BYTE_KIND) {
2532 Py_UCS1 *start = (Py_UCS1 *) data;
2533 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002535 else if (kind == PyUnicode_2BYTE_KIND) {
2536 Py_UCS2 *start = (Py_UCS2 *) data;
2537 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2538 }
2539 else {
2540 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002541 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 if (copy_null)
2544 target[len] = 0;
2545 return target;
2546}
2547
2548Py_UCS4*
2549PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2550 int copy_null)
2551{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002552 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 PyErr_BadInternalCall();
2554 return NULL;
2555 }
2556 return as_ucs4(string, target, targetsize, copy_null);
2557}
2558
2559Py_UCS4*
2560PyUnicode_AsUCS4Copy(PyObject *string)
2561{
2562 return as_ucs4(string, NULL, 0, 1);
2563}
2564
Victor Stinner15a11362012-10-06 23:48:20 +02002565/* maximum number of characters required for output of %lld or %p.
Victor Stinnere215d962012-10-06 23:03:36 +02002566 We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2567 plus 1 for the sign. 53/22 is an upper bound for log10(256). */
2568#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002569
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570static int
2571unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2572 Py_ssize_t width, Py_ssize_t precision)
2573{
2574 Py_ssize_t length, fill, arglen;
2575 Py_UCS4 maxchar;
2576
2577 if (PyUnicode_READY(str) == -1)
2578 return -1;
2579
2580 length = PyUnicode_GET_LENGTH(str);
2581 if ((precision == -1 || precision >= length)
2582 && width <= length)
2583 return _PyUnicodeWriter_WriteStr(writer, str);
2584
2585 if (precision != -1)
2586 length = Py_MIN(precision, length);
2587
2588 arglen = Py_MAX(length, width);
2589 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2590 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2591 else
2592 maxchar = writer->maxchar;
2593
2594 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2595 return -1;
2596
2597 if (width > length) {
2598 fill = width - length;
2599 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2600 return -1;
2601 writer->pos += fill;
2602 }
2603
2604 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2605 str, 0, length);
2606 writer->pos += length;
2607 return 0;
2608}
2609
2610static int
Victor Stinner998b8062018-09-12 00:23:25 +02002611unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002612 Py_ssize_t width, Py_ssize_t precision)
2613{
2614 /* UTF-8 */
2615 Py_ssize_t length;
2616 PyObject *unicode;
2617 int res;
2618
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002619 if (precision == -1) {
2620 length = strlen(str);
2621 }
2622 else {
2623 length = 0;
2624 while (length < precision && str[length]) {
2625 length++;
2626 }
2627 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2629 if (unicode == NULL)
2630 return -1;
2631
2632 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2633 Py_DECREF(unicode);
2634 return res;
2635}
2636
Victor Stinner96865452011-03-01 23:44:09 +00002637static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002638unicode_fromformat_arg(_PyUnicodeWriter *writer,
2639 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002640{
Victor Stinnere215d962012-10-06 23:03:36 +02002641 const char *p;
2642 Py_ssize_t len;
2643 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 Py_ssize_t width;
2645 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 int longflag;
2647 int longlongflag;
2648 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002649 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 p = f;
2652 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002653 zeropad = 0;
2654 if (*f == '0') {
2655 zeropad = 1;
2656 f++;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658
2659 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 width = -1;
2661 if (Py_ISDIGIT((unsigned)*f)) {
2662 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002663 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002664 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002665 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002666 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002668 return NULL;
2669 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002671 f++;
2672 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002673 }
2674 precision = -1;
2675 if (*f == '.') {
2676 f++;
2677 if (Py_ISDIGIT((unsigned)*f)) {
2678 precision = (*f - '0');
2679 f++;
2680 while (Py_ISDIGIT((unsigned)*f)) {
2681 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2682 PyErr_SetString(PyExc_ValueError,
2683 "precision too big");
2684 return NULL;
2685 }
2686 precision = (precision * 10) + (*f - '0');
2687 f++;
2688 }
2689 }
Victor Stinner96865452011-03-01 23:44:09 +00002690 if (*f == '%') {
2691 /* "%.3%s" => f points to "3" */
2692 f--;
2693 }
2694 }
2695 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002697 f--;
2698 }
Victor Stinner96865452011-03-01 23:44:09 +00002699
2700 /* Handle %ld, %lu, %lld and %llu. */
2701 longflag = 0;
2702 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002703 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002704 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002705 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002706 longflag = 1;
2707 ++f;
2708 }
Victor Stinner96865452011-03-01 23:44:09 +00002709 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002710 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002711 longlongflag = 1;
2712 f += 2;
2713 }
Victor Stinner96865452011-03-01 23:44:09 +00002714 }
2715 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002716 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002717 size_tflag = 1;
2718 ++f;
2719 }
Victor Stinnere215d962012-10-06 23:03:36 +02002720
2721 if (f[1] == '\0')
2722 writer->overallocate = 0;
2723
2724 switch (*f) {
2725 case 'c':
2726 {
2727 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002728 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002729 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002730 "character argument not in range(0x110000)");
2731 return NULL;
2732 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002733 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'i':
2739 case 'd':
2740 case 'u':
2741 case 'x':
2742 {
2743 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002744 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002746
2747 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002748 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002749 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002750 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002751 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002752 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002753 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002754 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002755 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002756 va_arg(*vargs, size_t));
2757 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002758 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002759 va_arg(*vargs, unsigned int));
2760 }
2761 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002762 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002763 }
2764 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002765 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002766 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002767 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002768 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002769 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002770 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002771 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002772 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002773 va_arg(*vargs, Py_ssize_t));
2774 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002775 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002776 va_arg(*vargs, int));
2777 }
2778 assert(len >= 0);
2779
Victor Stinnere215d962012-10-06 23:03:36 +02002780 if (precision < len)
2781 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002782
2783 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2785 return NULL;
2786
Victor Stinnere215d962012-10-06 23:03:36 +02002787 if (width > precision) {
2788 Py_UCS4 fillchar;
2789 fill = width - precision;
2790 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002791 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2792 return NULL;
2793 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 }
Victor Stinner15a11362012-10-06 23:48:20 +02002795 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2798 return NULL;
2799 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002800 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801
Victor Stinner4a587072013-11-19 12:54:53 +01002802 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2803 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002804 break;
2805 }
2806
2807 case 'p':
2808 {
2809 char number[MAX_LONG_LONG_CHARS];
2810
2811 len = sprintf(number, "%p", va_arg(*vargs, void*));
2812 assert(len >= 0);
2813
2814 /* %p is ill-defined: ensure leading 0x. */
2815 if (number[1] == 'X')
2816 number[1] = 'x';
2817 else if (number[1] != 'x') {
2818 memmove(number + 2, number,
2819 strlen(number) + 1);
2820 number[0] = '0';
2821 number[1] = 'x';
2822 len += 2;
2823 }
2824
Victor Stinner4a587072013-11-19 12:54:53 +01002825 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002826 return NULL;
2827 break;
2828 }
2829
2830 case 's':
2831 {
2832 /* UTF-8 */
2833 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002834 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002835 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002836 break;
2837 }
2838
2839 case 'U':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 assert(obj && _PyUnicode_CHECK(obj));
2843
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002845 return NULL;
2846 break;
2847 }
2848
2849 case 'V':
2850 {
2851 PyObject *obj = va_arg(*vargs, PyObject *);
2852 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002853 if (obj) {
2854 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002855 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 }
2858 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002860 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002862 }
2863 break;
2864 }
2865
2866 case 'S':
2867 {
2868 PyObject *obj = va_arg(*vargs, PyObject *);
2869 PyObject *str;
2870 assert(obj);
2871 str = PyObject_Str(obj);
2872 if (!str)
2873 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002874 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002875 Py_DECREF(str);
2876 return NULL;
2877 }
2878 Py_DECREF(str);
2879 break;
2880 }
2881
2882 case 'R':
2883 {
2884 PyObject *obj = va_arg(*vargs, PyObject *);
2885 PyObject *repr;
2886 assert(obj);
2887 repr = PyObject_Repr(obj);
2888 if (!repr)
2889 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002890 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 Py_DECREF(repr);
2892 return NULL;
2893 }
2894 Py_DECREF(repr);
2895 break;
2896 }
2897
2898 case 'A':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 PyObject *ascii;
2902 assert(obj);
2903 ascii = PyObject_ASCII(obj);
2904 if (!ascii)
2905 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002906 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002907 Py_DECREF(ascii);
2908 return NULL;
2909 }
2910 Py_DECREF(ascii);
2911 break;
2912 }
2913
2914 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002915 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 break;
2918
2919 default:
2920 /* if we stumble upon an unknown formatting code, copy the rest
2921 of the format string to the output string. (we cannot just
2922 skip the code, since there's no way to know what's in the
2923 argument list) */
2924 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002925 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 return NULL;
2927 f = p+len;
2928 return f;
2929 }
2930
2931 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002932 return f;
2933}
2934
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935PyObject *
2936PyUnicode_FromFormatV(const char *format, va_list vargs)
2937{
Victor Stinnere215d962012-10-06 23:03:36 +02002938 va_list vargs2;
2939 const char *f;
2940 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941
Victor Stinner8f674cc2013-04-17 23:02:17 +02002942 _PyUnicodeWriter_Init(&writer);
2943 writer.min_length = strlen(format) + 100;
2944 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002945
Benjamin Peterson0c212142016-09-20 20:39:33 -07002946 // Copy varags to be able to pass a reference to a subfunction.
2947 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002948
2949 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002950 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 f = unicode_fromformat_arg(&writer, f, &vargs2);
2952 if (f == NULL)
2953 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002956 const char *p;
2957 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002958
Victor Stinnere215d962012-10-06 23:03:36 +02002959 p = f;
2960 do
2961 {
2962 if ((unsigned char)*p > 127) {
2963 PyErr_Format(PyExc_ValueError,
2964 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2965 "string, got a non-ASCII byte: 0x%02x",
2966 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002967 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
2969 p++;
2970 }
2971 while (*p != '\0' && *p != '%');
2972 len = p - f;
2973
2974 if (*p == '\0')
2975 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002976
2977 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002978 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002979
2980 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002982 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002983 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002984 return _PyUnicodeWriter_Finish(&writer);
2985
2986 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002987 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002988 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002989 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002990}
2991
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992PyObject *
2993PyUnicode_FromFormat(const char *format, ...)
2994{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002995 PyObject* ret;
2996 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002997
2998#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002999 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 ret = PyUnicode_FromFormatV(format, vargs);
3004 va_end(vargs);
3005 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003006}
3007
Serhiy Storchakac46db922018-10-23 22:58:24 +03003008static Py_ssize_t
3009unicode_get_widechar_size(PyObject *unicode)
3010{
3011 Py_ssize_t res;
3012
3013 assert(unicode != NULL);
3014 assert(_PyUnicode_CHECK(unicode));
3015
3016 if (_PyUnicode_WSTR(unicode) != NULL) {
3017 return PyUnicode_WSTR_LENGTH(unicode);
3018 }
3019 assert(PyUnicode_IS_READY(unicode));
3020
3021 res = _PyUnicode_LENGTH(unicode);
3022#if SIZEOF_WCHAR_T == 2
3023 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3024 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3025 const Py_UCS4 *end = s + res;
3026 for (; s < end; ++s) {
3027 if (*s > 0xFFFF) {
3028 ++res;
3029 }
3030 }
3031 }
3032#endif
3033 return res;
3034}
3035
3036static void
3037unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3038{
3039 const wchar_t *wstr;
3040
3041 assert(unicode != NULL);
3042 assert(_PyUnicode_CHECK(unicode));
3043
3044 wstr = _PyUnicode_WSTR(unicode);
3045 if (wstr != NULL) {
3046 memcpy(w, wstr, size * sizeof(wchar_t));
3047 return;
3048 }
3049 assert(PyUnicode_IS_READY(unicode));
3050
3051 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3052 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3053 for (; size--; ++s, ++w) {
3054 *w = *s;
3055 }
3056 }
3057 else {
3058#if SIZEOF_WCHAR_T == 4
3059 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3060 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3061 for (; size--; ++s, ++w) {
3062 *w = *s;
3063 }
3064#else
3065 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3066 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3067 for (; size--; ++s, ++w) {
3068 Py_UCS4 ch = *s;
3069 if (ch > 0xFFFF) {
3070 assert(ch <= MAX_UNICODE);
3071 /* encode surrogate pair in this case */
3072 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3073 if (!size--)
3074 break;
3075 *w = Py_UNICODE_LOW_SURROGATE(ch);
3076 }
3077 else {
3078 *w = ch;
3079 }
3080 }
3081#endif
3082 }
3083}
3084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003085#ifdef HAVE_WCHAR_H
3086
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003087/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003088
Victor Stinnerd88d9832011-09-06 02:00:05 +02003089 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003090 character) required to convert the unicode object. Ignore size argument.
3091
Victor Stinnerd88d9832011-09-06 02:00:05 +02003092 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003093 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003094 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003095Py_ssize_t
3096PyUnicode_AsWideChar(PyObject *unicode,
3097 wchar_t *w,
3098 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003099{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003100 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003102 if (unicode == NULL) {
3103 PyErr_BadInternalCall();
3104 return -1;
3105 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110
3111 res = unicode_get_widechar_size(unicode);
3112 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003114 }
3115
3116 if (size > res) {
3117 size = res + 1;
3118 }
3119 else {
3120 res = size;
3121 }
3122 unicode_copy_as_widechar(unicode, w, size);
3123 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003124}
3125
Victor Stinner137c34c2010-09-29 10:25:54 +00003126wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003127PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003128 Py_ssize_t *size)
3129{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003130 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003131 Py_ssize_t buflen;
3132
3133 if (unicode == NULL) {
3134 PyErr_BadInternalCall();
3135 return NULL;
3136 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139 return NULL;
3140 }
3141
Serhiy Storchakac46db922018-10-23 22:58:24 +03003142 buflen = unicode_get_widechar_size(unicode);
3143 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003144 if (buffer == NULL) {
3145 PyErr_NoMemory();
3146 return NULL;
3147 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003148 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3149 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003151 }
3152 else if (wcslen(buffer) != (size_t)buflen) {
3153 PyMem_FREE(buffer);
3154 PyErr_SetString(PyExc_ValueError,
3155 "embedded null character");
3156 return NULL;
3157 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003158 return buffer;
3159}
3160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003165{
Victor Stinner8faf8212011-12-08 22:14:11 +01003166 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 PyErr_SetString(PyExc_ValueError,
3168 "chr() arg not in range(0x110000)");
3169 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003170 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003171
Victor Stinner985a82a2014-01-03 12:53:47 +01003172 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003173}
3174
Alexander Belopolsky40018472011-02-26 01:02:56 +00003175PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003176PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003178 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003180 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003181 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003182 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 Py_INCREF(obj);
3184 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003185 }
3186 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 /* For a Unicode subtype that's not a Unicode object,
3188 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003189 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003190 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003191 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003192 "Can't convert '%.100s' object to str implicitly",
3193 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003194 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003195}
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003198PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003199 const char *encoding,
3200 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003201{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 PyErr_BadInternalCall();
3207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003209
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003210 /* Decoding bytes objects is the most common case and should be fast */
3211 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003212 if (PyBytes_GET_SIZE(obj) == 0)
3213 _Py_RETURN_UNICODE_EMPTY();
3214 v = PyUnicode_Decode(
3215 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3216 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003217 return v;
3218 }
3219
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003220 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 PyErr_SetString(PyExc_TypeError,
3222 "decoding str is not supported");
3223 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003224 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003225
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003226 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3227 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3228 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003229 "decoding to str: need a bytes-like object, %.80s found",
3230 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003231 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003232 }
Tim Petersced69f82003-09-16 20:30:58 +00003233
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003234 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003235 PyBuffer_Release(&buffer);
3236 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003238
Serhiy Storchaka05997252013-01-26 12:14:02 +02003239 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003240 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003241 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242}
3243
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Alphanumeric characters and '.' are kept (lowercased with Py_TOLOWER());
   every run of other characters located between two kept characters is
   replaced by a single '_', and such runs at the start or the end of the
   name are dropped.

   `lower` is the output buffer and `lower_len` its size in bytes, including
   room for the terminating null byte.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters; this includes lower_len == 0, in which case
   nothing is written). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminating null byte.
       Bail out before computing &lower[lower_len - 1], whose index would
       otherwise wrap around and disable every bounds check below. */
    if (lower_len == 0) {
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];   /* slot reserved for the null byte */
    punct = 0;
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Remember the separator; it is emitted lazily, only if
               another kept character follows. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3292
Alexander Belopolsky40018472011-02-26 01:02:56 +00003293PyObject *
3294PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003295 Py_ssize_t size,
3296 const char *encoding,
3297 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003298{
3299 PyObject *buffer = NULL, *unicode;
3300 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003301 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3302
3303 if (encoding == NULL) {
3304 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3305 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003306
Fred Drakee4315f52000-05-09 19:53:39 +00003307 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003308 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3309 char *lower = buflower;
3310
3311 /* Fast paths */
3312 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3313 lower += 3;
3314 if (*lower == '_') {
3315 /* Match "utf8" and "utf_8" */
3316 lower++;
3317 }
3318
3319 if (lower[0] == '8' && lower[1] == 0) {
3320 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3321 }
3322 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3323 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3324 }
3325 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3326 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3327 }
3328 }
3329 else {
3330 if (strcmp(lower, "ascii") == 0
3331 || strcmp(lower, "us_ascii") == 0) {
3332 return PyUnicode_DecodeASCII(s, size, errors);
3333 }
Steve Dowercc16be82016-09-08 10:35:16 -07003334 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003335 else if (strcmp(lower, "mbcs") == 0) {
3336 return PyUnicode_DecodeMBCS(s, size, errors);
3337 }
3338 #endif
3339 else if (strcmp(lower, "latin1") == 0
3340 || strcmp(lower, "latin_1") == 0
3341 || strcmp(lower, "iso_8859_1") == 0
3342 || strcmp(lower, "iso8859_1") == 0) {
3343 return PyUnicode_DecodeLatin1(s, size, errors);
3344 }
3345 }
Victor Stinner37296e82010-06-10 13:36:23 +00003346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
3348 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003349 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003350 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003351 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003352 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (buffer == NULL)
3354 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003355 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 if (unicode == NULL)
3357 goto onError;
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003360 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003361 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003362 encoding,
3363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 Py_DECREF(unicode);
3365 goto onError;
3366 }
3367 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003368 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 Py_XDECREF(buffer);
3372 return NULL;
3373}
3374
Alexander Belopolsky40018472011-02-26 01:02:56 +00003375PyObject *
3376PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003377 const char *encoding,
3378 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003379{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003382 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003383 }
3384
Serhiy Storchaka00939072016-10-27 21:05:49 +03003385 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3386 "PyUnicode_AsDecodedObject() is deprecated; "
3387 "use PyCodec_Decode() to decode from str", 1) < 0)
3388 return NULL;
3389
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003390 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392
3393 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003394 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
Serhiy Storchaka00939072016-10-27 21:05:49 +03003409 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3410 "PyUnicode_AsDecodedUnicode() is deprecated; "
3411 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3412 return NULL;
3413
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003414 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416
3417 /* Decode via the codec registry */
3418 v = PyCodec_Decode(unicode, encoding, errors);
3419 if (v == NULL)
3420 goto onError;
3421 if (!PyUnicode_Check(v)) {
3422 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003423 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003424 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003425 encoding,
3426 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 Py_DECREF(v);
3428 goto onError;
3429 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003430 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433 return NULL;
3434}
3435
Alexander Belopolsky40018472011-02-26 01:02:56 +00003436PyObject *
3437PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003438 Py_ssize_t size,
3439 const char *encoding,
3440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441{
3442 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003444 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3448 Py_DECREF(unicode);
3449 return v;
3450}
3451
Alexander Belopolsky40018472011-02-26 01:02:56 +00003452PyObject *
3453PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003454 const char *encoding,
3455 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456{
3457 PyObject *v;
3458
3459 if (!PyUnicode_Check(unicode)) {
3460 PyErr_BadArgument();
3461 goto onError;
3462 }
3463
Serhiy Storchaka00939072016-10-27 21:05:49 +03003464 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3465 "PyUnicode_AsEncodedObject() is deprecated; "
3466 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3467 "or PyCodec_Encode() for generic encoding", 1) < 0)
3468 return NULL;
3469
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003470 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003472
3473 /* Encode via the codec registry */
3474 v = PyCodec_Encode(unicode, encoding, errors);
3475 if (v == NULL)
3476 goto onError;
3477 return v;
3478
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003480 return NULL;
3481}
3482
Victor Stinner1b579672011-12-17 05:47:23 +01003483
Victor Stinner2cba6b82018-01-10 22:46:15 +01003484static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003485unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003486 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003487{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 Py_ssize_t wlen;
3489 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3490 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003492 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003494 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003495 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003496 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003497 return NULL;
3498 }
3499
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003500 char *str;
3501 size_t error_pos;
3502 const char *reason;
3503 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003504 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003505 PyMem_Free(wstr);
3506
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003507 if (res != 0) {
3508 if (res == -2) {
3509 PyObject *exc;
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 if (exc != NULL) {
3516 PyCodec_StrictErrors(exc);
3517 Py_DECREF(exc);
3518 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003519 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003520 else if (res == -3) {
3521 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3522 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003524 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003526 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003527 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003528
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003529 PyObject *bytes = PyBytes_FromString(str);
3530 PyMem_RawFree(str);
3531 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003532}
3533
Victor Stinnerad158722010-10-27 00:25:46 +00003534PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003535PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3536{
Victor Stinner709d23d2019-05-02 14:56:30 -04003537 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3538 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003539}
3540
3541PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003542PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003543{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003544 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003545#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003546 if (interp->fs_codec.encoding) {
3547 return unicode_encode_utf8(unicode,
3548 interp->fs_codec.error_handler,
3549 interp->fs_codec.errors);
3550 }
3551 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003552 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003553 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003554 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003555 assert(errors != _Py_ERROR_UNKNOWN);
3556 return unicode_encode_utf8(unicode, errors, NULL);
3557 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003558#else
Victor Stinner793b5312011-04-27 00:24:21 +02003559 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3560 cannot use it to encode and decode filenames before it is loaded. Load
3561 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003562 implementation of the locale codec until the codec registry is
3563 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003564 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003565 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003566 interp->fs_codec.encoding,
3567 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003568 }
3569 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003570 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003571 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003572 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003573 assert(errors != _Py_ERROR_UNKNOWN);
3574 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003575 }
Victor Stinnerad158722010-10-27 00:25:46 +00003576#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003577}
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding,
3582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583{
3584 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003585 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (!PyUnicode_Check(unicode)) {
3588 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Fred Drakee4315f52000-05-09 19:53:39 +00003591
Victor Stinner942889a2016-09-05 15:40:10 -07003592 if (encoding == NULL) {
3593 return _PyUnicode_AsUTF8String(unicode, errors);
3594 }
3595
Fred Drakee4315f52000-05-09 19:53:39 +00003596 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003597 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3598 char *lower = buflower;
3599
3600 /* Fast paths */
3601 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3602 lower += 3;
3603 if (*lower == '_') {
3604 /* Match "utf8" and "utf_8" */
3605 lower++;
3606 }
3607
3608 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003610 }
3611 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3612 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3613 }
3614 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3615 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3616 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003617 }
Victor Stinner942889a2016-09-05 15:40:10 -07003618 else {
3619 if (strcmp(lower, "ascii") == 0
3620 || strcmp(lower, "us_ascii") == 0) {
3621 return _PyUnicode_AsASCIIString(unicode, errors);
3622 }
Steve Dowercc16be82016-09-08 10:35:16 -07003623#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003624 else if (strcmp(lower, "mbcs") == 0) {
3625 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3626 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003627#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003628 else if (strcmp(lower, "latin1") == 0 ||
3629 strcmp(lower, "latin_1") == 0 ||
3630 strcmp(lower, "iso_8859_1") == 0 ||
3631 strcmp(lower, "iso8859_1") == 0) {
3632 return _PyUnicode_AsLatin1String(unicode, errors);
3633 }
3634 }
Victor Stinner37296e82010-06-10 13:36:23 +00003635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003638 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 return NULL;
3641
3642 /* The normal path */
3643 if (PyBytes_Check(v))
3644 return v;
3645
3646 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003648 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003649 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650
3651 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "encoder %s returned bytearray instead of bytes; "
3653 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003654 encoding);
3655 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 Py_DECREF(v);
3657 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003660 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3661 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003662 Py_DECREF(v);
3663 return b;
3664 }
3665
3666 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003667 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003669 encoding,
3670 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003671 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 return NULL;
3673}
3674
Alexander Belopolsky40018472011-02-26 01:02:56 +00003675PyObject *
3676PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003677 const char *encoding,
3678 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003679{
3680 PyObject *v;
3681
3682 if (!PyUnicode_Check(unicode)) {
3683 PyErr_BadArgument();
3684 goto onError;
3685 }
3686
Serhiy Storchaka00939072016-10-27 21:05:49 +03003687 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3688 "PyUnicode_AsEncodedUnicode() is deprecated; "
3689 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3690 return NULL;
3691
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003692 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003694
3695 /* Encode via the codec registry */
3696 v = PyCodec_Encode(unicode, encoding, errors);
3697 if (v == NULL)
3698 goto onError;
3699 if (!PyUnicode_Check(v)) {
3700 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003701 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003702 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003703 encoding,
3704 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003705 Py_DECREF(v);
3706 goto onError;
3707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712}
3713
Victor Stinner2cba6b82018-01-10 22:46:15 +01003714static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003715unicode_decode_locale(const char *str, Py_ssize_t len,
3716 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003718 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3719 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720 return NULL;
3721 }
3722
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003723 wchar_t *wstr;
3724 size_t wlen;
3725 const char *reason;
3726 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003727 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003728 if (res != 0) {
3729 if (res == -2) {
3730 PyObject *exc;
3731 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3732 "locale", str, len,
3733 (Py_ssize_t)wlen,
3734 (Py_ssize_t)(wlen + 1),
3735 reason);
3736 if (exc != NULL) {
3737 PyCodec_StrictErrors(exc);
3738 Py_DECREF(exc);
3739 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003740 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003741 else if (res == -3) {
3742 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3743 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003744 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003745 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 }
Victor Stinner2f197072011-12-17 07:08:30 +01003747 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003748 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749
3750 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3751 PyMem_RawFree(wstr);
3752 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003753}
3754
3755PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003756PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3757 const char *errors)
3758{
Victor Stinner709d23d2019-05-02 14:56:30 -04003759 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3760 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003761}
3762
3763PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003764PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765{
3766 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003767 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3768 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003769}
3770
3771
3772PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003773PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003774 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003775 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3776}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003777
Christian Heimes5894ba72007-11-04 11:43:14 +00003778PyObject*
3779PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3780{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003781 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003782#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003783 if (interp->fs_codec.encoding) {
3784 return unicode_decode_utf8(s, size,
3785 interp->fs_codec.error_handler,
3786 interp->fs_codec.errors,
3787 NULL);
3788 }
3789 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003790 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003791 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003792 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 assert(errors != _Py_ERROR_UNKNOWN);
3794 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3795 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003796#else
Victor Stinner793b5312011-04-27 00:24:21 +02003797 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3798 cannot use it to encode and decode filenames before it is loaded. Load
3799 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003800 implementation of the locale codec until the codec registry is
3801 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003802 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003803 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 interp->fs_codec.encoding,
3805 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003806 }
3807 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003808 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003809 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003810 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003811 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812 }
Victor Stinnerad158722010-10-27 00:25:46 +00003813#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814}
3815
Martin v. Löwis011e8422009-05-05 04:43:17 +00003816
3817int
3818PyUnicode_FSConverter(PyObject* arg, void* addr)
3819{
Brett Cannonec6ce872016-09-06 15:50:29 -07003820 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003821 PyObject *output = NULL;
3822 Py_ssize_t size;
3823 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003824 if (arg == NULL) {
3825 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003826 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003827 return 1;
3828 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003829 path = PyOS_FSPath(arg);
3830 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003831 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003832 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003833 if (PyBytes_Check(path)) {
3834 output = path;
3835 }
3836 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3837 output = PyUnicode_EncodeFSDefault(path);
3838 Py_DECREF(path);
3839 if (!output) {
3840 return 0;
3841 }
3842 assert(PyBytes_Check(output));
3843 }
3844
Victor Stinner0ea2a462010-04-30 00:22:08 +00003845 size = PyBytes_GET_SIZE(output);
3846 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003847 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003848 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003849 Py_DECREF(output);
3850 return 0;
3851 }
3852 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003853 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854}
3855
3856
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857int
3858PyUnicode_FSDecoder(PyObject* arg, void* addr)
3859{
Brett Cannona5711202016-09-06 19:36:01 -07003860 int is_buffer = 0;
3861 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003862 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003863 if (arg == NULL) {
3864 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003865 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003866 return 1;
3867 }
Brett Cannona5711202016-09-06 19:36:01 -07003868
3869 is_buffer = PyObject_CheckBuffer(arg);
3870 if (!is_buffer) {
3871 path = PyOS_FSPath(arg);
3872 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003873 return 0;
3874 }
Brett Cannona5711202016-09-06 19:36:01 -07003875 }
3876 else {
3877 path = arg;
3878 Py_INCREF(arg);
3879 }
3880
3881 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003882 output = path;
3883 }
3884 else if (PyBytes_Check(path) || is_buffer) {
3885 PyObject *path_bytes = NULL;
3886
3887 if (!PyBytes_Check(path) &&
3888 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003889 "path should be string, bytes, or os.PathLike, not %.200s",
3890 Py_TYPE(arg)->tp_name)) {
3891 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003892 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003893 }
3894 path_bytes = PyBytes_FromObject(path);
3895 Py_DECREF(path);
3896 if (!path_bytes) {
3897 return 0;
3898 }
3899 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3900 PyBytes_GET_SIZE(path_bytes));
3901 Py_DECREF(path_bytes);
3902 if (!output) {
3903 return 0;
3904 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003905 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003906 else {
3907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003908 "path should be string, bytes, or os.PathLike, not %.200s",
3909 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003910 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003911 return 0;
3912 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003913 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003914 Py_DECREF(output);
3915 return 0;
3916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003918 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920 Py_DECREF(output);
3921 return 0;
3922 }
3923 *(PyObject**)addr = output;
3924 return Py_CLEANUP_SUPPORTED;
3925}
3926
3927
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003928const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003930{
Christian Heimesf3863112007-11-22 07:46:41 +00003931 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003933 if (!PyUnicode_Check(unicode)) {
3934 PyErr_BadArgument();
3935 return NULL;
3936 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003937 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003940 if (PyUnicode_UTF8(unicode) == NULL) {
3941 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003942 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 if (bytes == NULL)
3944 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3946 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003947 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 Py_DECREF(bytes);
3949 return NULL;
3950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003952 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 PyBytes_AS_STRING(bytes),
3954 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 Py_DECREF(bytes);
3956 }
3957
3958 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003959 *psize = PyUnicode_UTF8_LENGTH(unicode);
3960 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003961}
3962
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003963const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3967}
3968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969Py_UNICODE *
3970PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 if (!PyUnicode_Check(unicode)) {
3973 PyErr_BadArgument();
3974 return NULL;
3975 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003976 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3977 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003979 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981
Serhiy Storchakac46db922018-10-23 22:58:24 +03003982 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3983 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3984 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003987 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3988 if (w == NULL) {
3989 PyErr_NoMemory();
3990 return NULL;
3991 }
3992 unicode_copy_as_widechar(unicode, w, wlen + 1);
3993 _PyUnicode_WSTR(unicode) = w;
3994 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3995 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 }
3997 }
3998 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004000 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001}
4002
Alexander Belopolsky40018472011-02-26 01:02:56 +00004003Py_UNICODE *
4004PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007}
4008
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004009const Py_UNICODE *
4010_PyUnicode_AsUnicode(PyObject *unicode)
4011{
4012 Py_ssize_t size;
4013 const Py_UNICODE *wstr;
4014
4015 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4016 if (wstr && wcslen(wstr) != (size_t)size) {
4017 PyErr_SetString(PyExc_ValueError, "embedded null character");
4018 return NULL;
4019 }
4020 return wstr;
4021}
4022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023
Alexander Belopolsky40018472011-02-26 01:02:56 +00004024Py_ssize_t
4025PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026{
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 goto onError;
4030 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
4032 if (PyUnicode_AsUnicode(unicode) == NULL)
4033 goto onError;
4034 }
4035 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return -1;
4039}
4040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041Py_ssize_t
4042PyUnicode_GetLength(PyObject *unicode)
4043{
Victor Stinner07621332012-06-16 04:53:46 +02004044 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 PyErr_BadArgument();
4046 return -1;
4047 }
Victor Stinner07621332012-06-16 04:53:46 +02004048 if (PyUnicode_READY(unicode) == -1)
4049 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 return PyUnicode_GET_LENGTH(unicode);
4051}
4052
4053Py_UCS4
4054PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4055{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004056 void *data;
4057 int kind;
4058
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004059 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004060 PyErr_BadArgument();
4061 return (Py_UCS4)-1;
4062 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004063 if (PyUnicode_READY(unicode) == -1) {
4064 return (Py_UCS4)-1;
4065 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004066 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004067 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 return (Py_UCS4)-1;
4069 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004070 data = PyUnicode_DATA(unicode);
4071 kind = PyUnicode_KIND(unicode);
4072 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073}
4074
4075int
4076PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4077{
4078 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004079 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 return -1;
4081 }
Victor Stinner488fa492011-12-12 00:01:39 +01004082 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004083 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004084 PyErr_SetString(PyExc_IndexError, "string index out of range");
4085 return -1;
4086 }
Victor Stinner488fa492011-12-12 00:01:39 +01004087 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004088 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004089 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4090 PyErr_SetString(PyExc_ValueError, "character out of range");
4091 return -1;
4092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4094 index, ch);
4095 return 0;
4096}
4097
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4103
Victor Stinner554f3f02010-06-16 23:33:54 +00004104/* create or adjust a UnicodeDecodeError */
4105static void
4106make_decode_exception(PyObject **exceptionObject,
4107 const char *encoding,
4108 const char *input, Py_ssize_t length,
4109 Py_ssize_t startpos, Py_ssize_t endpos,
4110 const char *reason)
4111{
4112 if (*exceptionObject == NULL) {
4113 *exceptionObject = PyUnicodeDecodeError_Create(
4114 encoding, input, length, startpos, endpos, reason);
4115 }
4116 else {
4117 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4118 goto onError;
4119 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4120 goto onError;
4121 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4122 goto onError;
4123 }
4124 return;
4125
4126onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004127 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004128}
4129
Steve Dowercc16be82016-09-08 10:35:16 -07004130#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004131static int
4132widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4133{
4134 if (newsize > *size) {
4135 wchar_t *newbuf = *buf;
4136 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4137 PyErr_NoMemory();
4138 return -1;
4139 }
4140 *buf = newbuf;
4141 }
4142 *size = newsize;
4143 return 0;
4144}
4145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146/* error handling callback helper:
4147 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004148 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 and adjust various state variables.
4150 return 0 on success, -1 on error
4151*/
4152
Alexander Belopolsky40018472011-02-26 01:02:56 +00004153static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154unicode_decode_call_errorhandler_wchar(
4155 const char *errors, PyObject **errorHandler,
4156 const char *encoding, const char *reason,
4157 const char **input, const char **inend, Py_ssize_t *startinpos,
4158 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004159 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004161 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162
4163 PyObject *restuple = NULL;
4164 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004165 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004166 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t requiredsize;
4168 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 wchar_t *repwstr;
4171 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172
4173 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 *errorHandler = PyCodec_LookupError(errors);
4175 if (*errorHandler == NULL)
4176 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 }
4178
Victor Stinner554f3f02010-06-16 23:33:54 +00004179 make_decode_exception(exceptionObject,
4180 encoding,
4181 *input, *inend - *input,
4182 *startinpos, *endinpos,
4183 reason);
4184 if (*exceptionObject == NULL)
4185 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004187 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004191 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004194 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004196
4197 /* Copy back the bytes variables, which might have been modified by the
4198 callback */
4199 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4200 if (!inputobj)
4201 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202 *input = PyBytes_AS_STRING(inputobj);
4203 insize = PyBytes_GET_SIZE(inputobj);
4204 *inend = *input + insize;
4205 /* we can DECREF safely, as the exception has another reference,
4206 so the object won't go away. */
4207 Py_DECREF(inputobj);
4208
4209 if (newpos<0)
4210 newpos = insize+newpos;
4211 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004212 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213 goto onError;
4214 }
4215
4216 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4217 if (repwstr == NULL)
4218 goto onError;
4219 /* need more space? (at least enough for what we
4220 have+the replacement+the rest of the string (starting
4221 at the new input position), so we won't have to check space
4222 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004223 requiredsize = *outpos;
4224 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4225 goto overflow;
4226 requiredsize += repwlen;
4227 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4228 goto overflow;
4229 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004230 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004231 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004232 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004234 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004236 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004238 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240 *endinpos = newpos;
4241 *inptr = *input + newpos;
4242
4243 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004244 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245 return 0;
4246
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004247 overflow:
4248 PyErr_SetString(PyExc_OverflowError,
4249 "decoded result is too long for a Python string");
4250
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 onError:
4252 Py_XDECREF(restuple);
4253 return -1;
4254}
Steve Dowercc16be82016-09-08 10:35:16 -07004255#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004256
4257static int
4258unicode_decode_call_errorhandler_writer(
4259 const char *errors, PyObject **errorHandler,
4260 const char *encoding, const char *reason,
4261 const char **input, const char **inend, Py_ssize_t *startinpos,
4262 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4263 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4264{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004265 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266
4267 PyObject *restuple = NULL;
4268 PyObject *repunicode = NULL;
4269 Py_ssize_t insize;
4270 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004271 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004272 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004274 int need_to_grow = 0;
4275 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004276
4277 if (*errorHandler == NULL) {
4278 *errorHandler = PyCodec_LookupError(errors);
4279 if (*errorHandler == NULL)
4280 goto onError;
4281 }
4282
4283 make_decode_exception(exceptionObject,
4284 encoding,
4285 *input, *inend - *input,
4286 *startinpos, *endinpos,
4287 reason);
4288 if (*exceptionObject == NULL)
4289 goto onError;
4290
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004291 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 if (restuple == NULL)
4293 goto onError;
4294 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 goto onError;
4297 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004298 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004300
4301 /* Copy back the bytes variables, which might have been modified by the
4302 callback */
4303 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4304 if (!inputobj)
4305 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004306 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004307 *input = PyBytes_AS_STRING(inputobj);
4308 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004309 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004310 /* we can DECREF safely, as the exception has another reference,
4311 so the object won't go away. */
4312 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004317 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320
Victor Stinner170ca6f2013-04-18 00:25:28 +02004321 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004322 if (replen > 1) {
4323 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004324 need_to_grow = 1;
4325 }
4326 new_inptr = *input + newpos;
4327 if (*inend - new_inptr > remain) {
4328 /* We don't know the decoding algorithm here so we make the worst
4329 assumption that one byte decodes to one unicode character.
4330 If unfortunately one byte could decode to more unicode characters,
4331 the decoder may write out-of-bound then. Is it possible for the
4332 algorithms using this function? */
4333 writer->min_length += *inend - new_inptr - remain;
4334 need_to_grow = 1;
4335 }
4336 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004337 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004338 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004339 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4340 goto onError;
4341 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004343 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004346 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004349 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355}
4356
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357/* --- UTF-7 Codec -------------------------------------------------------- */
4358
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359/* See RFC2152 for details. We encode conservatively and decode liberally. */
4360
/* Base-64 helper macros for the UTF-7 codec.  Every macro below may
   evaluate its argument more than once, so only pass expressions without
   side effects. */

/* Non-zero if c belongs to the base-64 alphabet: A-Z, a-z, 0-9, '+', '/'. */
#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= '0' && (c) <= '9') ||          \
     (c) == '+' || (c) == '/')

/* Value (0..63) of the base-64 character c.  The result is only meaningful
   when IS_BASE64(c) holds: anything that is not a letter, digit or '+'
   maps to 63. */
#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :               \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the lowest 6 bits of n; higher bits are ignored. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: non-zero if byte c, found outside a shift sequence in a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every ASCII
 * byte is accepted except '+', which starts a base-64 section. */
#define DECODE_DIRECT(c)                    \
    ((c) <= 127 && (c) != '+')
4391
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array maps each ASCII code (0..127) to
 * its category:
 *   0 : "Set D"
 *       alphanumeric and '(),-./:?
 *   1 : "Set O"
 *       !"#$%&*;<=>@[]^_`{|}
 *   2 : "whitespace"
 *       ht nl cr sp
 *   3 : special (must be base64 encoded)
 *       everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is a read-only lookup and therefore declared const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4425
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        : the code point to test; only 1..127 can be direct
 *   directO  : non-zero if Set O characters (category 1) are direct
 *   directWS : non-zero if whitespace (category 2) is direct
 *
 * Set D characters (category 0) are always direct.  All three arguments
 * are parenthesized in the expansion so that compound expressions such as
 * `a || b` can be passed safely; c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437
Alexander Belopolsky40018472011-02-26 01:02:56 +00004438PyObject *
4439PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004440 Py_ssize_t size,
4441 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4444}
4445
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446/* The decoder. The only state we preserve is our read position,
4447 * i.e. how many characters we have consumed. So if we end in the
4448 * middle of a shift sequence we have to back off the read position
4449 * and the output to the beginning of the sequence, otherwise we lose
4450 * all the shift state (seen bits, number of bits seen, high
4451 * surrogate). */
4452
Alexander Belopolsky40018472011-02-26 01:02:56 +00004453PyObject *
4454PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004455 Py_ssize_t size,
4456 const char *errors,
4457 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004460 Py_ssize_t startinpos;
4461 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 const char *errmsg = "";
4465 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 unsigned int base64bits = 0;
4468 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004469 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 PyObject *errorHandler = NULL;
4471 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (size == 0) {
4474 if (consumed)
4475 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004476 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004477 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004479 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004480 _PyUnicodeWriter_Init(&writer);
4481 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004482
4483 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 e = s + size;
4485
4486 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004487 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004489 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (inShift) { /* in a base-64 section */
4492 if (IS_BASE64(ch)) { /* consume a base-64 character */
4493 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4494 base64bits += 6;
4495 s++;
4496 if (base64bits >= 16) {
4497 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004498 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 base64bits -= 16;
4500 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004501 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 if (surrogate) {
4503 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004504 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4505 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004506 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004507 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004509 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 }
4511 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004512 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004513 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 }
4516 }
Victor Stinner551ac952011-11-29 22:58:13 +01004517 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 /* first surrogate */
4519 surrogate = outCh;
4520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004522 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004523 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 }
4525 }
4526 }
4527 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (base64bits > 0) { /* left-over bits */
4530 if (base64bits >= 6) {
4531 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004532 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 errmsg = "partial character in shift sequence";
4534 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 else {
4537 /* Some bits remain; they should be zero */
4538 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004539 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 errmsg = "non-zero padding bits in shift sequence";
4541 goto utf7Error;
4542 }
4543 }
4544 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004545 if (surrogate && DECODE_DIRECT(ch)) {
4546 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4547 goto onError;
4548 }
4549 surrogate = 0;
4550 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 /* '-' is absorbed; other terminating
4552 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004553 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
4556 }
4557 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 s++; /* consume '+' */
4560 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004562 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004563 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004565 else if (s < e && !IS_BASE64(*s)) {
4566 s++;
4567 errmsg = "ill-formed sequence";
4568 goto utf7Error;
4569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004572 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004575 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 }
4577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004580 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 else {
4584 startinpos = s-starts;
4585 s++;
4586 errmsg = "unexpected special character";
4587 goto utf7Error;
4588 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004592 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 errors, &errorHandler,
4594 "utf7", errmsg,
4595 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 }
4599
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* end of string */
4601
4602 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4603 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004604 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (surrogate ||
4606 (base64bits >= 6) ||
4607 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004609 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 errors, &errorHandler,
4611 "utf7", "unterminated shift sequence",
4612 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004613 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 goto onError;
4615 if (s < e)
4616 goto restart;
4617 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619
4620 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004621 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004623 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004624 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004625 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004626 writer.kind, writer.data, shiftOutStart);
4627 Py_XDECREF(errorHandler);
4628 Py_XDECREF(exc);
4629 _PyUnicodeWriter_Dealloc(&writer);
4630 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004631 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004632 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 }
4634 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004635 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 Py_XDECREF(errorHandler);
4645 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 return NULL;
4648}
4649
4650
Alexander Belopolsky40018472011-02-26 01:02:56 +00004651PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652_PyUnicode_EncodeUTF7(PyObject *str,
4653 int base64SetO,
4654 int base64WhiteSpace,
4655 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004657 int kind;
4658 void *data;
4659 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004662 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 unsigned int base64bits = 0;
4664 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 char * out;
4666 char * start;
4667
Benjamin Petersonbac79492012-01-14 13:34:47 -05004668 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004669 return NULL;
4670 kind = PyUnicode_KIND(str);
4671 data = PyUnicode_DATA(str);
4672 len = PyUnicode_GET_LENGTH(str);
4673
4674 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004677 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004678 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004679 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004680 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681 if (v == NULL)
4682 return NULL;
4683
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004684 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004685 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 if (inShift) {
4689 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4690 /* shifting out */
4691 if (base64bits) { /* output remaining bits */
4692 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4693 base64buffer = 0;
4694 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
4696 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 /* Characters not in the BASE64 set implicitly unshift the sequence
4698 so no '-' is required, except if the character is itself a '-' */
4699 if (IS_BASE64(ch) || ch == '-') {
4700 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 *out++ = (char) ch;
4703 }
4704 else {
4705 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 else { /* not in a shift sequence */
4709 if (ch == '+') {
4710 *out++ = '+';
4711 *out++ = '-';
4712 }
4713 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4714 *out++ = (char) ch;
4715 }
4716 else {
4717 *out++ = '+';
4718 inShift = 1;
4719 goto encode_char;
4720 }
4721 }
4722 continue;
4723encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004725 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004726
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 /* code first surrogate */
4728 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004729 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 while (base64bits >= 6) {
4731 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4732 base64bits -= 6;
4733 }
4734 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004735 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 base64bits += 16;
4738 base64buffer = (base64buffer << 16) | ch;
4739 while (base64bits >= 6) {
4740 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4741 base64bits -= 6;
4742 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004743 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744 if (base64bits)
4745 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4746 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 if (_PyBytes_Resize(&v, out - start) < 0)
4749 return NULL;
4750 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752PyObject *
4753PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4754 Py_ssize_t size,
4755 int base64SetO,
4756 int base64WhiteSpace,
4757 const char *errors)
4758{
4759 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004760 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761 if (tmp == NULL)
4762 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004763 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004764 base64WhiteSpace, errors);
4765 Py_DECREF(tmp);
4766 return result;
4767}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004768
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769#undef IS_BASE64
4770#undef FROM_BASE64
4771#undef TO_BASE64
4772#undef DECODE_DIRECT
4773#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775/* --- UTF-8 Codec -------------------------------------------------------- */
4776
Alexander Belopolsky40018472011-02-26 01:02:56 +00004777PyObject *
4778PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004779 Py_ssize_t size,
4780 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
Walter Dörwald69652032004-09-07 20:24:22 +00004782 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4783}
4784
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785#include "stringlib/asciilib.h"
4786#include "stringlib/codecs.h"
4787#include "stringlib/undef.h"
4788
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004789#include "stringlib/ucs1lib.h"
4790#include "stringlib/codecs.h"
4791#include "stringlib/undef.h"
4792
4793#include "stringlib/ucs2lib.h"
4794#include "stringlib/codecs.h"
4795#include "stringlib/undef.h"
4796
4797#include "stringlib/ucs4lib.h"
4798#include "stringlib/codecs.h"
4799#include "stringlib/undef.h"
4800
Antoine Pitrouab868312009-01-10 15:40:25 +00004801/* Mask to quickly check whether a C 'long' contains a
4802 non-ASCII, UTF8-encoded char. */
4803#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004804# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004805#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004806# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004807#else
4808# error C 'long' size should be either 4 or 8!
4809#endif
4810
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811static Py_ssize_t
4812ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004817 /*
4818 * Issue #17237: m68k is a bit different from most architectures in
4819 * that objects do not use "natural alignment" - for example, int and
4820 * long are only aligned at 2-byte boundaries. Therefore the assert()
4821 * won't work; also, tests have shown that skipping the "optimised
4822 * version" will even speed up m68k.
4823 */
4824#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004826 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4827 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 /* Fast path, see in STRINGLIB(utf8_decode) for
4829 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004830 /* Help allocation */
4831 const char *_p = p;
4832 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 while (_p < aligned_end) {
4834 unsigned long value = *(const unsigned long *) _p;
4835 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004837 *((unsigned long *)q) = value;
4838 _p += SIZEOF_LONG;
4839 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 p = _p;
4842 while (p < end) {
4843 if ((unsigned char)*p & 0x80)
4844 break;
4845 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004850#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851 while (p < end) {
4852 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4853 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004854 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004855 /* Help allocation */
4856 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 while (_p < aligned_end) {
4858 unsigned long value = *(unsigned long *) _p;
4859 if (value & ASCII_CHAR_MASK)
4860 break;
4861 _p += SIZEOF_LONG;
4862 }
4863 p = _p;
4864 if (_p == end)
4865 break;
4866 }
4867 if ((unsigned char)*p & 0x80)
4868 break;
4869 ++p;
4870 }
4871 memcpy(dest, start, p - start);
4872 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873}
Antoine Pitrouab868312009-01-10 15:40:25 +00004874
Victor Stinner709d23d2019-05-02 14:56:30 -04004875static PyObject *
4876unicode_decode_utf8(const char *s, Py_ssize_t size,
4877 _Py_error_handler error_handler, const char *errors,
4878 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004879{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004880 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004881 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883
4884 Py_ssize_t startinpos;
4885 Py_ssize_t endinpos;
4886 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004887 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004889
4890 if (size == 0) {
4891 if (consumed)
4892 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004893 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004894 }
4895
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4897 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004898 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 *consumed = 1;
4900 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004901 }
4902
Victor Stinner8f674cc2013-04-17 23:02:17 +02004903 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004904 writer.min_length = size;
4905 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004906 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004907
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004908 writer.pos = ascii_decode(s, end, writer.data);
4909 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (s < end) {
4911 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004913
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 if (PyUnicode_IS_ASCII(writer.buffer))
4916 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004918 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 } else {
4922 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 }
4925
4926 switch (ch) {
4927 case 0:
4928 if (s == end || consumed)
4929 goto End;
4930 errmsg = "unexpected end of data";
4931 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004932 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 break;
4934 case 1:
4935 errmsg = "invalid start byte";
4936 startinpos = s - starts;
4937 endinpos = startinpos + 1;
4938 break;
4939 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004940 case 3:
4941 case 4:
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004942 if (s == end || consumed) {
4943 goto End;
4944 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 errmsg = "invalid continuation byte";
4946 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004947 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 break;
4949 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004950 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 goto onError;
4952 continue;
4953 }
4954
Victor Stinner1d65d912015-10-05 13:43:50 +02004955 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004956 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004957
4958 switch (error_handler) {
4959 case _Py_ERROR_IGNORE:
4960 s += (endinpos - startinpos);
4961 break;
4962
4963 case _Py_ERROR_REPLACE:
4964 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4965 goto onError;
4966 s += (endinpos - startinpos);
4967 break;
4968
4969 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004970 {
4971 Py_ssize_t i;
4972
Victor Stinner1d65d912015-10-05 13:43:50 +02004973 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4974 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004975 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004976 ch = (Py_UCS4)(unsigned char)(starts[i]);
4977 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4978 ch + 0xdc00);
4979 writer.pos++;
4980 }
4981 s += (endinpos - startinpos);
4982 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004983 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004984
4985 default:
4986 if (unicode_decode_call_errorhandler_writer(
4987 errors, &error_handler_obj,
4988 "utf-8", errmsg,
4989 &starts, &end, &startinpos, &endinpos, &exc, &s,
4990 &writer))
4991 goto onError;
4992 }
Victor Stinner785938e2011-12-11 20:09:03 +01004993 }
4994
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 if (consumed)
4997 *consumed = s - starts;
4998
Victor Stinner1d65d912015-10-05 13:43:50 +02004999 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002
5003onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005004 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005008}
5009
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005010
Victor Stinner709d23d2019-05-02 14:56:30 -04005011PyObject *
5012PyUnicode_DecodeUTF8Stateful(const char *s,
5013 Py_ssize_t size,
5014 const char *errors,
5015 Py_ssize_t *consumed)
5016{
5017 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5018}
5019
5020
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005021/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5022 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005023
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005024 On success, write a pointer to a newly allocated wide character string into
5025 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5026 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 On memory allocation failure, return -1.
5029
5030 On decoding error (if surrogateescape is zero), return -2. If wlen is
5031 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5032 is not NULL, write the decoding error message into *reason. */
5033int
5034_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005035 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005036{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005037 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005038 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 wchar_t *unicode;
5040 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005041
Victor Stinner3d4226a2018-08-29 22:21:32 +02005042 int surrogateescape = 0;
5043 int surrogatepass = 0;
5044 switch (errors)
5045 {
5046 case _Py_ERROR_STRICT:
5047 break;
5048 case _Py_ERROR_SURROGATEESCAPE:
5049 surrogateescape = 1;
5050 break;
5051 case _Py_ERROR_SURROGATEPASS:
5052 surrogatepass = 1;
5053 break;
5054 default:
5055 return -3;
5056 }
5057
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005058 /* Note: size will always be longer than the resulting Unicode
5059 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005060 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005061 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005062 }
5063
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005064 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005065 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005067 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005068
5069 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005070 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005072 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 if (ch > 0xFF) {
5080#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005081 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005083 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005084 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5086 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5087#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005090 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005092 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005093
5094 if (surrogateescape) {
5095 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5096 }
5097 else {
5098 /* Is it a valid three-byte code? */
5099 if (surrogatepass
5100 && (e - s) >= 3
5101 && (s[0] & 0xf0) == 0xe0
5102 && (s[1] & 0xc0) == 0x80
5103 && (s[2] & 0xc0) == 0x80)
5104 {
5105 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5106 s += 3;
5107 unicode[outpos++] = ch;
5108 }
5109 else {
5110 PyMem_RawFree(unicode );
5111 if (reason != NULL) {
5112 switch (ch) {
5113 case 0:
5114 *reason = "unexpected end of data";
5115 break;
5116 case 1:
5117 *reason = "invalid start byte";
5118 break;
5119 /* 2, 3, 4 */
5120 default:
5121 *reason = "invalid continuation byte";
5122 break;
5123 }
5124 }
5125 if (wlen != NULL) {
5126 *wlen = s - orig_s;
5127 }
5128 return -2;
5129 }
5130 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005131 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005133 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005134 if (wlen) {
5135 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005137 *wstr = unicode;
5138 return 0;
5139}
5140
Victor Stinner5f9cf232019-03-19 01:46:25 +01005141
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005143_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5144 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145{
5146 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005147 int res = _Py_DecodeUTF8Ex(arg, arglen,
5148 &wstr, wlen,
5149 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005151 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5152 assert(res != -3);
5153 if (wlen) {
5154 *wlen = (size_t)res;
5155 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 return NULL;
5157 }
5158 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159}
5160
Antoine Pitrouab868312009-01-10 15:40:25 +00005161
Victor Stinnere47e6982017-12-21 15:45:16 +01005162/* UTF-8 encoder using the surrogateescape error handler .
5163
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005164 On success, return 0 and write the newly allocated character string (use
5165 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005166
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005167 On encoding failure, return -2 and write the position of the invalid
5168 surrogate character into *error_pos (if error_pos is set) and the decoding
5169 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005170
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005171 On memory allocation failure, return -1. */
5172int
5173_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005174 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005175{
5176 const Py_ssize_t max_char_size = 4;
5177 Py_ssize_t len = wcslen(text);
5178
5179 assert(len >= 0);
5180
Victor Stinner3d4226a2018-08-29 22:21:32 +02005181 int surrogateescape = 0;
5182 int surrogatepass = 0;
5183 switch (errors)
5184 {
5185 case _Py_ERROR_STRICT:
5186 break;
5187 case _Py_ERROR_SURROGATEESCAPE:
5188 surrogateescape = 1;
5189 break;
5190 case _Py_ERROR_SURROGATEPASS:
5191 surrogatepass = 1;
5192 break;
5193 default:
5194 return -3;
5195 }
5196
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005197 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5198 return -1;
5199 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005200 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (raw_malloc) {
5202 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005203 }
5204 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005206 }
5207 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005208 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005209 }
5210
5211 char *p = bytes;
5212 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005213 for (i = 0; i < len; ) {
5214 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005215 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005216 i++;
5217#if Py_UNICODE_SIZE == 2
5218 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5219 && i < len
5220 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5221 {
5222 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5223 i++;
5224 }
5225#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005226
5227 if (ch < 0x80) {
5228 /* Encode ASCII */
5229 *p++ = (char) ch;
5230
5231 }
5232 else if (ch < 0x0800) {
5233 /* Encode Latin-1 */
5234 *p++ = (char)(0xc0 | (ch >> 6));
5235 *p++ = (char)(0x80 | (ch & 0x3f));
5236 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005237 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005238 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005239 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005240 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005241 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005242 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 if (reason != NULL) {
5244 *reason = "encoding error";
5245 }
5246 if (raw_malloc) {
5247 PyMem_RawFree(bytes);
5248 }
5249 else {
5250 PyMem_Free(bytes);
5251 }
5252 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005253 }
5254 *p++ = (char)(ch & 0xff);
5255 }
5256 else if (ch < 0x10000) {
5257 *p++ = (char)(0xe0 | (ch >> 12));
5258 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5259 *p++ = (char)(0x80 | (ch & 0x3f));
5260 }
5261 else { /* ch >= 0x10000 */
5262 assert(ch <= MAX_UNICODE);
5263 /* Encode UCS4 Unicode ordinals */
5264 *p++ = (char)(0xf0 | (ch >> 18));
5265 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5266 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5267 *p++ = (char)(0x80 | (ch & 0x3f));
5268 }
5269 }
5270 *p++ = '\0';
5271
5272 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005273 char *bytes2;
5274 if (raw_malloc) {
5275 bytes2 = PyMem_RawRealloc(bytes, final_size);
5276 }
5277 else {
5278 bytes2 = PyMem_Realloc(bytes, final_size);
5279 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005280 if (bytes2 == NULL) {
5281 if (error_pos != NULL) {
5282 *error_pos = (size_t)-1;
5283 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005284 if (raw_malloc) {
5285 PyMem_RawFree(bytes);
5286 }
5287 else {
5288 PyMem_Free(bytes);
5289 }
5290 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005291 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005292 *str = bytes2;
5293 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005294}
5295
5296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005297/* Primary internal function which creates utf8 encoded bytes objects.
5298
5299 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005300 and allocate exactly as much space needed at the end. Else allocate the
5301 maximum possible needed (4 result bytes per Unicode character), and return
5302 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005303*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005304static PyObject *
5305unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307{
Victor Stinner6099a032011-12-18 14:22:26 +01005308 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005309 void *data;
5310 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005312 if (!PyUnicode_Check(unicode)) {
5313 PyErr_BadArgument();
5314 return NULL;
5315 }
5316
5317 if (PyUnicode_READY(unicode) == -1)
5318 return NULL;
5319
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005320 if (PyUnicode_UTF8(unicode))
5321 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5322 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005323
5324 kind = PyUnicode_KIND(unicode);
5325 data = PyUnicode_DATA(unicode);
5326 size = PyUnicode_GET_LENGTH(unicode);
5327
Benjamin Petersonead6b532011-12-20 17:23:42 -06005328 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005330 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005331 case PyUnicode_1BYTE_KIND:
5332 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5333 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005334 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005335 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005336 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005337 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005338 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340}
5341
Alexander Belopolsky40018472011-02-26 01:02:56 +00005342PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005343_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5344{
5345 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5346}
5347
5348
5349PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005350PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5351 Py_ssize_t size,
5352 const char *errors)
5353{
5354 PyObject *v, *unicode;
5355
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005356 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005357 if (unicode == NULL)
5358 return NULL;
5359 v = _PyUnicode_AsUTF8String(unicode, errors);
5360 Py_DECREF(unicode);
5361 return v;
5362}
5363
5364PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005365PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368}
5369
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370/* --- UTF-32 Codec ------------------------------------------------------- */
5371
5372PyObject *
5373PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 Py_ssize_t size,
5375 const char *errors,
5376 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377{
5378 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5379}
5380
5381PyObject *
5382PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 Py_ssize_t size,
5384 const char *errors,
5385 int *byteorder,
5386 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387{
5388 const char *starts = s;
5389 Py_ssize_t startinpos;
5390 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005391 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005392 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005393 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005394 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396 PyObject *errorHandler = NULL;
5397 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005398
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 q = (unsigned char *)s;
5400 e = q + size;
5401
5402 if (byteorder)
5403 bo = *byteorder;
5404
5405 /* Check for BOM marks (U+FEFF) in the input and adjust current
5406 byte order setting accordingly. In native mode, the leading BOM
5407 mark is skipped, in all other modes, it is copied to the output
5408 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005409 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005410 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005411 if (bom == 0x0000FEFF) {
5412 bo = -1;
5413 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005415 else if (bom == 0xFFFE0000) {
5416 bo = 1;
5417 q += 4;
5418 }
5419 if (byteorder)
5420 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421 }
5422
Victor Stinnere64322e2012-10-30 23:12:47 +01005423 if (q == e) {
5424 if (consumed)
5425 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005426 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427 }
5428
Victor Stinnere64322e2012-10-30 23:12:47 +01005429#ifdef WORDS_BIGENDIAN
5430 le = bo < 0;
5431#else
5432 le = bo <= 0;
5433#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005435
Victor Stinner8f674cc2013-04-17 23:02:17 +02005436 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005437 writer.min_length = (e - q + 3) / 4;
5438 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005439 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005440
Victor Stinnere64322e2012-10-30 23:12:47 +01005441 while (1) {
5442 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005444
Victor Stinnere64322e2012-10-30 23:12:47 +01005445 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005446 enum PyUnicode_Kind kind = writer.kind;
5447 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005448 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005449 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005450 if (le) {
5451 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005452 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005453 if (ch > maxch)
5454 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005455 if (kind != PyUnicode_1BYTE_KIND &&
5456 Py_UNICODE_IS_SURROGATE(ch))
5457 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005458 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005459 q += 4;
5460 } while (q <= last);
5461 }
5462 else {
5463 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005464 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005465 if (ch > maxch)
5466 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 if (kind != PyUnicode_1BYTE_KIND &&
5468 Py_UNICODE_IS_SURROGATE(ch))
5469 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005470 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005471 q += 4;
5472 } while (q <= last);
5473 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005474 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005475 }
5476
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005478 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 startinpos = ((const char *)q) - starts;
5480 endinpos = startinpos + 4;
5481 }
5482 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 startinpos = ((const char *)q) - starts;
5488 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005490 else {
5491 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005492 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005493 goto onError;
5494 q += 4;
5495 continue;
5496 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005497 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005498 startinpos = ((const char *)q) - starts;
5499 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005501
5502 /* The remaining input chars are ignored if the callback
5503 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005504 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005508 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005510 }
5511
Walter Dörwald41980ca2007-08-16 21:55:45 +00005512 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005514
Walter Dörwald41980ca2007-08-16 21:55:45 +00005515 Py_XDECREF(errorHandler);
5516 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005521 Py_XDECREF(errorHandler);
5522 Py_XDECREF(exc);
5523 return NULL;
5524}
5525
5526PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005527_PyUnicode_EncodeUTF32(PyObject *str,
5528 const char *errors,
5529 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005531 enum PyUnicode_Kind kind;
5532 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005533 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005534 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005535 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005536#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005537 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005538#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005539 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005541 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005542 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 PyObject *errorHandler = NULL;
5544 PyObject *exc = NULL;
5545 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005547 if (!PyUnicode_Check(str)) {
5548 PyErr_BadArgument();
5549 return NULL;
5550 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005551 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005552 return NULL;
5553 kind = PyUnicode_KIND(str);
5554 data = PyUnicode_DATA(str);
5555 len = PyUnicode_GET_LENGTH(str);
5556
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005557 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005558 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005559 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005560 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005561 if (v == NULL)
5562 return NULL;
5563
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005564 /* output buffer is 4-bytes aligned */
5565 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005566 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005567 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005568 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005569 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005570 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005571
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005573 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005574 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005575 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005576 else
5577 encoding = "utf-32";
5578
5579 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005580 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5581 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005582 }
5583
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 pos = 0;
5585 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005586 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005587
5588 if (kind == PyUnicode_2BYTE_KIND) {
5589 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5590 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005591 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005592 else {
5593 assert(kind == PyUnicode_4BYTE_KIND);
5594 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5595 &out, native_ordering);
5596 }
5597 if (pos == len)
5598 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005599
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005600 rep = unicode_encode_call_errorhandler(
5601 errors, &errorHandler,
5602 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005603 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 if (!rep)
5605 goto error;
5606
5607 if (PyBytes_Check(rep)) {
5608 repsize = PyBytes_GET_SIZE(rep);
5609 if (repsize & 3) {
5610 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 "surrogates not allowed");
5613 goto error;
5614 }
5615 moreunits = repsize / 4;
5616 }
5617 else {
5618 assert(PyUnicode_Check(rep));
5619 if (PyUnicode_READY(rep) < 0)
5620 goto error;
5621 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5622 if (!PyUnicode_IS_ASCII(rep)) {
5623 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005624 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005625 "surrogates not allowed");
5626 goto error;
5627 }
5628 }
5629
5630 /* four bytes are reserved for each surrogate */
5631 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005632 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005633 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005634 /* integer overflow */
5635 PyErr_NoMemory();
5636 goto error;
5637 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005638 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005639 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005640 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005641 }
5642
5643 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005644 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005645 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005646 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5649 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 }
5651
5652 Py_CLEAR(rep);
5653 }
5654
5655 /* Cut back to size actually needed. This is necessary for, for example,
5656 encoding of a string containing isolated surrogates and the 'ignore'
5657 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005659 if (nsize != PyBytes_GET_SIZE(v))
5660 _PyBytes_Resize(&v, nsize);
5661 Py_XDECREF(errorHandler);
5662 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005664 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 error:
5666 Py_XDECREF(rep);
5667 Py_XDECREF(errorHandler);
5668 Py_XDECREF(exc);
5669 Py_XDECREF(v);
5670 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005671}
5672
Alexander Belopolsky40018472011-02-26 01:02:56 +00005673PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005674PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5675 Py_ssize_t size,
5676 const char *errors,
5677 int byteorder)
5678{
5679 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005680 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005681 if (tmp == NULL)
5682 return NULL;
5683 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5684 Py_DECREF(tmp);
5685 return result;
5686}
5687
5688PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005689PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005690{
Victor Stinnerb960b342011-11-20 19:12:52 +01005691 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005692}
5693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694/* --- UTF-16 Codec ------------------------------------------------------- */
5695
Tim Peters772747b2001-08-09 22:21:55 +00005696PyObject *
5697PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 Py_ssize_t size,
5699 const char *errors,
5700 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Walter Dörwald69652032004-09-07 20:24:22 +00005702 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5703}
5704
5705PyObject *
5706PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 Py_ssize_t size,
5708 const char *errors,
5709 int *byteorder,
5710 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005711{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 Py_ssize_t startinpos;
5714 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005715 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005716 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005717 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005718 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005719 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 PyObject *errorHandler = NULL;
5721 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
Tim Peters772747b2001-08-09 22:21:55 +00005724 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005725 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
5727 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005728 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005730 /* Check for BOM marks (U+FEFF) in the input and adjust current
5731 byte order setting accordingly. In native mode, the leading BOM
5732 mark is skipped, in all other modes, it is copied to the output
5733 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005734 if (bo == 0 && size >= 2) {
5735 const Py_UCS4 bom = (q[1] << 8) | q[0];
5736 if (bom == 0xFEFF) {
5737 q += 2;
5738 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005740 else if (bom == 0xFFFE) {
5741 q += 2;
5742 bo = 1;
5743 }
5744 if (byteorder)
5745 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Antoine Pitrou63065d72012-05-15 23:48:04 +02005748 if (q == e) {
5749 if (consumed)
5750 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005751 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005752 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005753
Christian Heimes743e0cd2012-10-17 23:52:17 +02005754#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005755 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005757#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005758 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005760#endif
Tim Peters772747b2001-08-09 22:21:55 +00005761
Antoine Pitrou63065d72012-05-15 23:48:04 +02005762 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005763 character count normally. Error handler will take care of
5764 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005765 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005766 writer.min_length = (e - q + 1) / 2;
5767 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005769
Antoine Pitrou63065d72012-05-15 23:48:04 +02005770 while (1) {
5771 Py_UCS4 ch = 0;
5772 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005773 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005774 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005775 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005776 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005778 native_ordering);
5779 else
5780 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005781 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005782 native_ordering);
5783 } else if (kind == PyUnicode_2BYTE_KIND) {
5784 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 native_ordering);
5787 } else {
5788 assert(kind == PyUnicode_4BYTE_KIND);
5789 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005791 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005792 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794
Antoine Pitrou63065d72012-05-15 23:48:04 +02005795 switch (ch)
5796 {
5797 case 0:
5798 /* remaining byte at the end? (size should be even) */
5799 if (q == e || consumed)
5800 goto End;
5801 errmsg = "truncated data";
5802 startinpos = ((const char *)q) - starts;
5803 endinpos = ((const char *)e) - starts;
5804 break;
5805 /* The remaining input chars are ignored if the callback
5806 chooses to skip the input */
5807 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005808 q -= 2;
5809 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005810 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005811 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005812 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005813 endinpos = ((const char *)e) - starts;
5814 break;
5815 case 2:
5816 errmsg = "illegal encoding";
5817 startinpos = ((const char *)q) - 2 - starts;
5818 endinpos = startinpos + 2;
5819 break;
5820 case 3:
5821 errmsg = "illegal UTF-16 surrogate";
5822 startinpos = ((const char *)q) - 4 - starts;
5823 endinpos = startinpos + 2;
5824 break;
5825 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005826 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 continue;
5829 }
5830
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005831 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005832 errors,
5833 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005835 &starts,
5836 (const char **)&e,
5837 &startinpos,
5838 &endinpos,
5839 &exc,
5840 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005841 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
5844
Antoine Pitrou63065d72012-05-15 23:48:04 +02005845End:
Walter Dörwald69652032004-09-07 20:24:22 +00005846 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 Py_XDECREF(errorHandler);
5850 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005854 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 Py_XDECREF(errorHandler);
5856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return NULL;
5858}
5859
Tim Peters772747b2001-08-09 22:21:55 +00005860PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005861_PyUnicode_EncodeUTF16(PyObject *str,
5862 const char *errors,
5863 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005865 enum PyUnicode_Kind kind;
5866 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005868 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005869 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005870 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005871#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005872 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005873#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005874 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005875#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005876 const char *encoding;
5877 Py_ssize_t nsize, pos;
5878 PyObject *errorHandler = NULL;
5879 PyObject *exc = NULL;
5880 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005881
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 if (!PyUnicode_Check(str)) {
5883 PyErr_BadArgument();
5884 return NULL;
5885 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005886 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887 return NULL;
5888 kind = PyUnicode_KIND(str);
5889 data = PyUnicode_DATA(str);
5890 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005893 if (kind == PyUnicode_4BYTE_KIND) {
5894 const Py_UCS4 *in = (const Py_UCS4 *)data;
5895 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005896 while (in < end) {
5897 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005899 }
5900 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005901 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005902 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005904 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005905 nsize = len + pairs + (byteorder == 0);
5906 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005907 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005911 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005912 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005913 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005914 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005915 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005916 }
5917 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005918 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005919 }
Tim Peters772747b2001-08-09 22:21:55 +00005920
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005921 if (kind == PyUnicode_1BYTE_KIND) {
5922 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5923 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005924 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005925
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005926 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005927 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005928 }
5929 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005930 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005931 }
5932 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005933 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005934 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935
5936 pos = 0;
5937 while (pos < len) {
5938 Py_ssize_t repsize, moreunits;
5939
5940 if (kind == PyUnicode_2BYTE_KIND) {
5941 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5942 &out, native_ordering);
5943 }
5944 else {
5945 assert(kind == PyUnicode_4BYTE_KIND);
5946 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5947 &out, native_ordering);
5948 }
5949 if (pos == len)
5950 break;
5951
5952 rep = unicode_encode_call_errorhandler(
5953 errors, &errorHandler,
5954 encoding, "surrogates not allowed",
5955 str, &exc, pos, pos + 1, &pos);
5956 if (!rep)
5957 goto error;
5958
5959 if (PyBytes_Check(rep)) {
5960 repsize = PyBytes_GET_SIZE(rep);
5961 if (repsize & 1) {
5962 raise_encode_exception(&exc, encoding,
5963 str, pos - 1, pos,
5964 "surrogates not allowed");
5965 goto error;
5966 }
5967 moreunits = repsize / 2;
5968 }
5969 else {
5970 assert(PyUnicode_Check(rep));
5971 if (PyUnicode_READY(rep) < 0)
5972 goto error;
5973 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5974 if (!PyUnicode_IS_ASCII(rep)) {
5975 raise_encode_exception(&exc, encoding,
5976 str, pos - 1, pos,
5977 "surrogates not allowed");
5978 goto error;
5979 }
5980 }
5981
5982 /* two bytes are reserved for each surrogate */
5983 if (moreunits > 1) {
5984 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005985 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005986 /* integer overflow */
5987 PyErr_NoMemory();
5988 goto error;
5989 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005990 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005991 goto error;
5992 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5993 }
5994
5995 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005996 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005997 out += moreunits;
5998 } else /* rep is unicode */ {
5999 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6000 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6001 &out, native_ordering);
6002 }
6003
6004 Py_CLEAR(rep);
6005 }
6006
6007 /* Cut back to size actually needed. This is necessary for, for example,
6008 encoding of a string containing isolated surrogates and the 'ignore' handler
6009 is used. */
6010 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6011 if (nsize != PyBytes_GET_SIZE(v))
6012 _PyBytes_Resize(&v, nsize);
6013 Py_XDECREF(errorHandler);
6014 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006015 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006016 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006017 error:
6018 Py_XDECREF(rep);
6019 Py_XDECREF(errorHandler);
6020 Py_XDECREF(exc);
6021 Py_XDECREF(v);
6022 return NULL;
6023#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024}
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6028 Py_ssize_t size,
6029 const char *errors,
6030 int byteorder)
6031{
6032 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006033 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 if (tmp == NULL)
6035 return NULL;
6036 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6037 Py_DECREF(tmp);
6038 return result;
6039}
6040
6041PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006042PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045}
6046
6047/* --- Unicode Escape Codec ----------------------------------------------- */
6048
/* Cached pointer to the name-lookup capsule API.  It starts out NULL and is
   filled in with PyCapsule_Import() the first time the unicode-escape
   decoder below handles a \N{name} escape. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006049static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006050
Alexander Belopolsky40018472011-02-26 01:02:56 +00006051PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006052_PyUnicode_DecodeUnicodeEscape(const char *s,
6053 Py_ssize_t size,
6054 const char *errors,
6055 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 PyObject *errorHandler = NULL;
6061 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062
Eric V. Smith42454af2016-10-31 09:22:08 -04006063 // so we can remember if we've seen an invalid escape char or not
6064 *first_invalid_escape = NULL;
6065
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006067 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 }
6069 /* Escaped strings will always be longer than the resulting
6070 Unicode string, so we start with size here and then reduce the
6071 length after conversion to the true value.
6072 (but if the error callback returns a long replacement string
6073 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006074 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 writer.min_length = size;
6076 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6077 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 }
6079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 end = s + size;
6081 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 unsigned char c = (unsigned char) *s++;
6083 Py_UCS4 ch;
6084 int count;
6085 Py_ssize_t startinpos;
6086 Py_ssize_t endinpos;
6087 const char *message;
6088
6089#define WRITE_ASCII_CHAR(ch) \
6090 do { \
6091 assert(ch <= 127); \
6092 assert(writer.pos < writer.size); \
6093 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6094 } while(0)
6095
6096#define WRITE_CHAR(ch) \
6097 do { \
6098 if (ch <= writer.maxchar) { \
6099 assert(writer.pos < writer.size); \
6100 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6101 } \
6102 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6103 goto onError; \
6104 } \
6105 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
6107 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 if (c != '\\') {
6109 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 continue;
6111 }
6112
Victor Stinner62ec3312016-09-06 17:04:34 -07006113 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006115 if (s >= end) {
6116 message = "\\ at end of string";
6117 goto error;
6118 }
6119 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006120
Victor Stinner62ec3312016-09-06 17:04:34 -07006121 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006122 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 case '\n': continue;
6126 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6127 case '\'': WRITE_ASCII_CHAR('\''); continue;
6128 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6129 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006130 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6132 case 't': WRITE_ASCII_CHAR('\t'); continue;
6133 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6134 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006135 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006136 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006137 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006138 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 case '0': case '1': case '2': case '3':
6142 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006144 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 ch = (ch<<3) + *s++ - '0';
6146 if (s < end && '0' <= *s && *s <= '7') {
6147 ch = (ch<<3) + *s++ - '0';
6148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006150 WRITE_CHAR(ch);
6151 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 /* hex escapes */
6154 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006157 message = "truncated \\xXX escape";
6158 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006163 message = "truncated \\uXXXX escape";
6164 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006167 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006168 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006169 message = "truncated \\UXXXXXXXX escape";
6170 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006172 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 ch <<= 4;
6174 if (c >= '0' && c <= '9') {
6175 ch += c - '0';
6176 }
6177 else if (c >= 'a' && c <= 'f') {
6178 ch += c - ('a' - 10);
6179 }
6180 else if (c >= 'A' && c <= 'F') {
6181 ch += c - ('A' - 10);
6182 }
6183 else {
6184 break;
6185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006186 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006188 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 }
6190
6191 /* when we get here, ch is a 32-bit unicode character */
6192 if (ch > MAX_UNICODE) {
6193 message = "illegal Unicode character";
6194 goto error;
6195 }
6196
6197 WRITE_CHAR(ch);
6198 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006199
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006201 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006202 if (ucnhash_CAPI == NULL) {
6203 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006204 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6205 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 if (ucnhash_CAPI == NULL) {
6207 PyErr_SetString(
6208 PyExc_UnicodeError,
6209 "\\N escapes not supported (can't load unicodedata module)"
6210 );
6211 goto onError;
6212 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006213 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006214
6215 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006216 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 const char *start = ++s;
6218 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006219 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006220 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006221 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006222 namelen = s - start;
6223 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006224 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006225 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 ch = 0xffffffff; /* in case 'getcode' messes up */
6227 if (namelen <= INT_MAX &&
6228 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6229 &ch, 0)) {
6230 assert(ch <= MAX_UNICODE);
6231 WRITE_CHAR(ch);
6232 continue;
6233 }
6234 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006235 }
6236 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006237 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006238
6239 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006240 if (*first_invalid_escape == NULL) {
6241 *first_invalid_escape = s-1; /* Back up one char, since we've
6242 already incremented s. */
6243 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 WRITE_ASCII_CHAR('\\');
6245 WRITE_CHAR(c);
6246 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006248
6249 error:
6250 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006252 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006253 errors, &errorHandler,
6254 "unicodeescape", message,
6255 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006257 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006259 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006260
6261#undef WRITE_ASCII_CHAR
6262#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006264
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006265 Py_XDECREF(errorHandler);
6266 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006267 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006268
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006270 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 Py_XDECREF(errorHandler);
6272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 return NULL;
6274}
6275
Eric V. Smith42454af2016-10-31 09:22:08 -04006276PyObject *
6277PyUnicode_DecodeUnicodeEscape(const char *s,
6278 Py_ssize_t size,
6279 const char *errors)
6280{
6281 const char *first_invalid_escape;
6282 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6283 &first_invalid_escape);
6284 if (result == NULL)
6285 return NULL;
6286 if (first_invalid_escape != NULL) {
6287 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6288 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006289 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006290 Py_DECREF(result);
6291 return NULL;
6292 }
6293 }
6294 return result;
6295}
6296
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006297/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Alexander Belopolsky40018472011-02-26 01:02:56 +00006299PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006300PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006302 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
Ezio Melottie7f90372012-10-05 03:33:31 +03006309 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006310 escape.
6311
Ezio Melottie7f90372012-10-05 03:33:31 +03006312 For UCS1 strings it's '\xxx', 4 bytes per source character.
6313 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6314 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006315 */
6316
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006317 if (!PyUnicode_Check(unicode)) {
6318 PyErr_BadArgument();
6319 return NULL;
6320 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006322 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006323 }
Victor Stinner358af132015-10-12 22:36:57 +02006324
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 if (len == 0) {
6327 return PyBytes_FromStringAndSize(NULL, 0);
6328 }
6329
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 kind = PyUnicode_KIND(unicode);
6331 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6333 bytes, and 1 byte characters 4. */
6334 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006335 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 return PyErr_NoMemory();
6337 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006338 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 if (repr == NULL) {
6340 return NULL;
6341 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006344 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006345 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006346
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 /* U+0000-U+00ff range */
6348 if (ch < 0x100) {
6349 if (ch >= ' ' && ch < 127) {
6350 if (ch != '\\') {
6351 /* Copy printable US ASCII as-is */
6352 *p++ = (char) ch;
6353 }
6354 /* Escape backslashes */
6355 else {
6356 *p++ = '\\';
6357 *p++ = '\\';
6358 }
6359 }
Victor Stinner358af132015-10-12 22:36:57 +02006360
Victor Stinner62ec3312016-09-06 17:04:34 -07006361 /* Map special whitespace to '\t', \n', '\r' */
6362 else if (ch == '\t') {
6363 *p++ = '\\';
6364 *p++ = 't';
6365 }
6366 else if (ch == '\n') {
6367 *p++ = '\\';
6368 *p++ = 'n';
6369 }
6370 else if (ch == '\r') {
6371 *p++ = '\\';
6372 *p++ = 'r';
6373 }
6374
6375 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6376 else {
6377 *p++ = '\\';
6378 *p++ = 'x';
6379 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6380 *p++ = Py_hexdigits[ch & 0x000F];
6381 }
Tim Petersced69f82003-09-16 20:30:58 +00006382 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006383 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006384 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 *p++ = '\\';
6386 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006387 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6388 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6389 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6390 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006392 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6393 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006394
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 /* Make sure that the first two digits are zero */
6396 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006397 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 *p++ = 'U';
6399 *p++ = '0';
6400 *p++ = '0';
6401 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6402 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6403 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6404 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6405 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6406 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 assert(p - PyBytes_AS_STRING(repr) > 0);
6411 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6412 return NULL;
6413 }
6414 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415}
6416
Alexander Belopolsky40018472011-02-26 01:02:56 +00006417PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6419 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006422 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
6426
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427 result = PyUnicode_AsUnicodeEscapeString(tmp);
6428 Py_DECREF(tmp);
6429 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430}
6431
6432/* --- Raw Unicode Escape Codec ------------------------------------------- */
6433
Alexander Belopolsky40018472011-02-26 01:02:56 +00006434PyObject *
6435PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006436 Py_ssize_t size,
6437 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006440 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 PyObject *errorHandler = NULL;
6443 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006444
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006446 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 /* Escaped strings will always be longer than the resulting
6450 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 length after conversion to the true value. (But decoding error
6452 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006453 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 writer.min_length = size;
6455 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6456 goto onError;
6457 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 end = s + size;
6460 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 unsigned char c = (unsigned char) *s++;
6462 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006463 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 Py_ssize_t startinpos;
6465 Py_ssize_t endinpos;
6466 const char *message;
6467
6468#define WRITE_CHAR(ch) \
6469 do { \
6470 if (ch <= writer.maxchar) { \
6471 assert(writer.pos < writer.size); \
6472 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6473 } \
6474 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6475 goto onError; \
6476 } \
6477 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 if (c != '\\' || s >= end) {
6481 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006483 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006484
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 c = (unsigned char) *s++;
6486 if (c == 'u') {
6487 count = 4;
6488 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 else if (c == 'U') {
6491 count = 8;
6492 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006493 }
6494 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 assert(writer.pos < writer.size);
6496 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6497 WRITE_CHAR(c);
6498 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006499 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 startinpos = s - starts - 2;
6501
6502 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6503 for (ch = 0; count && s < end; ++s, --count) {
6504 c = (unsigned char)*s;
6505 ch <<= 4;
6506 if (c >= '0' && c <= '9') {
6507 ch += c - '0';
6508 }
6509 else if (c >= 'a' && c <= 'f') {
6510 ch += c - ('a' - 10);
6511 }
6512 else if (c >= 'A' && c <= 'F') {
6513 ch += c - ('A' - 10);
6514 }
6515 else {
6516 break;
6517 }
6518 }
6519 if (!count) {
6520 if (ch <= MAX_UNICODE) {
6521 WRITE_CHAR(ch);
6522 continue;
6523 }
6524 message = "\\Uxxxxxxxx out of range";
6525 }
6526
6527 endinpos = s-starts;
6528 writer.min_length = end - s + writer.pos;
6529 if (unicode_decode_call_errorhandler_writer(
6530 errors, &errorHandler,
6531 "rawunicodeescape", message,
6532 &starts, &end, &startinpos, &endinpos, &exc, &s,
6533 &writer)) {
6534 goto onError;
6535 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006536 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006537
6538#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 Py_XDECREF(errorHandler);
6541 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006542 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006543
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006545 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006546 Py_XDECREF(errorHandler);
6547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006554PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555{
Victor Stinner62ec3312016-09-06 17:04:34 -07006556 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006558 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006559 int kind;
6560 void *data;
6561 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006563 if (!PyUnicode_Check(unicode)) {
6564 PyErr_BadArgument();
6565 return NULL;
6566 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006567 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006568 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006570 kind = PyUnicode_KIND(unicode);
6571 data = PyUnicode_DATA(unicode);
6572 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 if (kind == PyUnicode_1BYTE_KIND) {
6574 return PyBytes_FromStringAndSize(data, len);
6575 }
Victor Stinner0e368262011-11-10 20:12:49 +01006576
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6578 bytes, and 1 byte characters 4. */
6579 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006580
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 if (len > PY_SSIZE_T_MAX / expandsize) {
6582 return PyErr_NoMemory();
6583 }
6584 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6585 if (repr == NULL) {
6586 return NULL;
6587 }
6588 if (len == 0) {
6589 return repr;
6590 }
6591
6592 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006593 for (pos = 0; pos < len; pos++) {
6594 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006595
Victor Stinner62ec3312016-09-06 17:04:34 -07006596 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6597 if (ch < 0x100) {
6598 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006599 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006600 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006601 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 *p++ = '\\';
6603 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006604 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6605 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6606 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6607 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6610 else {
6611 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6612 *p++ = '\\';
6613 *p++ = 'U';
6614 *p++ = '0';
6615 *p++ = '0';
6616 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6617 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6618 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6619 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6620 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6621 *p++ = Py_hexdigits[ch & 15];
6622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006624
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 assert(p > PyBytes_AS_STRING(repr));
6626 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6627 return NULL;
6628 }
6629 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
Alexander Belopolsky40018472011-02-26 01:02:56 +00006632PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6634 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006636 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006637 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006638 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006639 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006640 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6641 Py_DECREF(tmp);
6642 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
6645/* --- Latin-1 Codec ------------------------------------------------------ */
6646
Alexander Belopolsky40018472011-02-26 01:02:56 +00006647PyObject *
6648PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006649 Py_ssize_t size,
6650 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006653 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654}
6655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657static void
6658make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006660 PyObject *unicode,
6661 Py_ssize_t startpos, Py_ssize_t endpos,
6662 const char *reason)
6663{
6664 if (*exceptionObject == NULL) {
6665 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006667 encoding, unicode, startpos, endpos, reason);
6668 }
6669 else {
6670 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6671 goto onError;
6672 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6673 goto onError;
6674 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6675 goto onError;
6676 return;
6677 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006678 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006679 }
6680}
6681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006683static void
6684raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006685 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006686 PyObject *unicode,
6687 Py_ssize_t startpos, Py_ssize_t endpos,
6688 const char *reason)
6689{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006690 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006691 encoding, unicode, startpos, endpos, reason);
6692 if (*exceptionObject != NULL)
6693 PyCodec_StrictErrors(*exceptionObject);
6694}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695
6696/* error handling callback helper:
6697 build arguments, call the callback and check the arguments,
6698 put the result into newpos and return the replacement string, which
6699 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006700static PyObject *
6701unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006702 PyObject **errorHandler,
6703 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006705 Py_ssize_t startpos, Py_ssize_t endpos,
6706 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006708 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 PyObject *restuple;
6711 PyObject *resunicode;
6712
6713 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 }
6718
Benjamin Petersonbac79492012-01-14 13:34:47 -05006719 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 return NULL;
6721 len = PyUnicode_GET_LENGTH(unicode);
6722
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006723 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006728 restuple = PyObject_CallFunctionObjArgs(
6729 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006733 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 Py_DECREF(restuple);
6735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006737 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 &resunicode, newpos)) {
6739 Py_DECREF(restuple);
6740 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006742 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6743 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6744 Py_DECREF(restuple);
6745 return NULL;
6746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748 *newpos = len + *newpos;
6749 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006750 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 Py_DECREF(restuple);
6752 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006753 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 Py_INCREF(resunicode);
6755 Py_DECREF(restuple);
6756 return resunicode;
6757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006761 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006762 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 /* input state */
6765 Py_ssize_t pos=0, size;
6766 int kind;
6767 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768 /* pointer into the output */
6769 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006770 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6771 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006772 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006774 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006775 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006776 /* output object */
6777 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778
Benjamin Petersonbac79492012-01-14 13:34:47 -05006779 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 return NULL;
6781 size = PyUnicode_GET_LENGTH(unicode);
6782 kind = PyUnicode_KIND(unicode);
6783 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 /* allocate enough for a simple encoding without
6785 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006786 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006787 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006788
6789 _PyBytesWriter_Init(&writer);
6790 str = _PyBytesWriter_Alloc(&writer, size);
6791 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006795 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006796
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006798 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006800 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006804 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006806 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006807 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006809
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006810 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006812
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006813 /* Only overallocate the buffer if it's not the last write */
6814 writer.overallocate = (collend < size);
6815
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006817 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006818 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006819
6820 switch (error_handler) {
6821 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006822 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006824
6825 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006826 memset(str, '?', collend - collstart);
6827 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006828 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006829 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006830 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 break;
Victor Stinner50149202015-09-22 00:26:54 +02006832
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006833 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006834 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006835 writer.min_size -= (collend - collstart);
6836 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006837 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006838 if (str == NULL)
6839 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006840 pos = collend;
6841 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006842
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006843 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006844 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006845 writer.min_size -= (collend - collstart);
6846 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006847 unicode, collstart, collend);
6848 if (str == NULL)
6849 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006850 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 break;
Victor Stinner50149202015-09-22 00:26:54 +02006852
Victor Stinnerc3713e92015-09-29 12:32:13 +02006853 case _Py_ERROR_SURROGATEESCAPE:
6854 for (i = collstart; i < collend; ++i) {
6855 ch = PyUnicode_READ(kind, data, i);
6856 if (ch < 0xdc80 || 0xdcff < ch) {
6857 /* Not a UTF-8b surrogate */
6858 break;
6859 }
6860 *str++ = (char)(ch - 0xdc00);
6861 ++pos;
6862 }
6863 if (i >= collend)
6864 break;
6865 collstart = pos;
6866 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006867 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006868
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006870 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6871 encoding, reason, unicode, &exc,
6872 collstart, collend, &newpos);
6873 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006875
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006876 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006877 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006878
Victor Stinner6bd525b2015-10-09 13:10:05 +02006879 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006880 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006881 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006882 PyBytes_AS_STRING(rep),
6883 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006884 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006885 else {
6886 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006887
Victor Stinner6bd525b2015-10-09 13:10:05 +02006888 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006890
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006891 if (limit == 256 ?
6892 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6893 !PyUnicode_IS_ASCII(rep))
6894 {
6895 /* Not all characters are smaller than limit */
6896 raise_encode_exception(&exc, encoding, unicode,
6897 collstart, collend, reason);
6898 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006900 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6901 str = _PyBytesWriter_WriteBytes(&writer, str,
6902 PyUnicode_DATA(rep),
6903 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006905 if (str == NULL)
6906 goto onError;
6907
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006909 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006910 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006911
6912 /* If overallocation was disabled, ensure that it was the last
6913 write. Otherwise, we missed an optimization */
6914 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006915 }
6916 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006917
Victor Stinner50149202015-09-22 00:26:54 +02006918 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006920 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006921
6922 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006923 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006924 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006925 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006926 Py_XDECREF(exc);
6927 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928}
6929
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006931PyObject *
6932PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006933 Py_ssize_t size,
6934 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006936 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006937 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006938 if (unicode == NULL)
6939 return NULL;
6940 result = unicode_encode_ucs1(unicode, errors, 256);
6941 Py_DECREF(unicode);
6942 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943}
6944
Alexander Belopolsky40018472011-02-26 01:02:56 +00006945PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006946_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
6948 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 PyErr_BadArgument();
6950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006952 if (PyUnicode_READY(unicode) == -1)
6953 return NULL;
6954 /* Fast path: if it is a one-byte string, construct
6955 bytes object directly. */
6956 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6957 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6958 PyUnicode_GET_LENGTH(unicode));
6959 /* Non-Latin-1 characters present. Defer to above function to
6960 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006961 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006962}
6963
6964PyObject*
6965PyUnicode_AsLatin1String(PyObject *unicode)
6966{
6967 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
6970/* --- 7-bit ASCII Codec -------------------------------------------------- */
6971
Alexander Belopolsky40018472011-02-26 01:02:56 +00006972PyObject *
6973PyUnicode_DecodeASCII(const char *s,
6974 Py_ssize_t size,
6975 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006978 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006979 int kind;
6980 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981 Py_ssize_t startinpos;
6982 Py_ssize_t endinpos;
6983 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006984 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006985 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006986 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006990 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006993 if (size == 1 && (unsigned char)s[0] < 128)
6994 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006995
Victor Stinner8f674cc2013-04-17 23:02:17 +02006996 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006997 writer.min_length = size;
6998 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006999 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007002 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007003 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 writer.pos = outpos;
7005 if (writer.pos == size)
7006 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007007
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 s += writer.pos;
7009 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007011 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 PyUnicode_WRITE(kind, data, writer.pos, c);
7014 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007016 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018
7019 /* byte outsize range 0x00..0x7f: call the error handler */
7020
7021 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007022 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023
7024 switch (error_handler)
7025 {
7026 case _Py_ERROR_REPLACE:
7027 case _Py_ERROR_SURROGATEESCAPE:
7028 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007029 but we may switch to UCS2 at the first write */
7030 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7031 goto onError;
7032 kind = writer.kind;
7033 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007034
7035 if (error_handler == _Py_ERROR_REPLACE)
7036 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7037 else
7038 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7039 writer.pos++;
7040 ++s;
7041 break;
7042
7043 case _Py_ERROR_IGNORE:
7044 ++s;
7045 break;
7046
7047 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 startinpos = s-starts;
7049 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007050 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007051 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 "ascii", "ordinal not in range(128)",
7053 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007054 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007056 kind = writer.kind;
7057 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007060 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007062 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007063
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007065 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007066 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007067 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 return NULL;
7069}
7070
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007071/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007072PyObject *
7073PyUnicode_EncodeASCII(const Py_UNICODE *p,
7074 Py_ssize_t size,
7075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007077 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007078 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007079 if (unicode == NULL)
7080 return NULL;
7081 result = unicode_encode_ucs1(unicode, errors, 128);
7082 Py_DECREF(unicode);
7083 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007087_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088{
7089 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 PyErr_BadArgument();
7091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007093 if (PyUnicode_READY(unicode) == -1)
7094 return NULL;
7095 /* Fast path: if it is an ASCII-only string, construct bytes object
7096 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007097 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7099 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007100 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007101}
7102
7103PyObject *
7104PyUnicode_AsASCIIString(PyObject *unicode)
7105{
7106 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107}
7108
Steve Dowercc16be82016-09-08 10:35:16 -07007109#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007110
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007111/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007112
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007113#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114#define NEED_RETRY
7115#endif
7116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117#ifndef WC_ERR_INVALID_CHARS
7118# define WC_ERR_INVALID_CHARS 0x0080
7119#endif
7120
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007121static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007122code_page_name(UINT code_page, PyObject **obj)
7123{
7124 *obj = NULL;
7125 if (code_page == CP_ACP)
7126 return "mbcs";
7127 if (code_page == CP_UTF7)
7128 return "CP_UTF7";
7129 if (code_page == CP_UTF8)
7130 return "CP_UTF8";
7131
7132 *obj = PyBytes_FromFormat("cp%u", code_page);
7133 if (*obj == NULL)
7134 return NULL;
7135 return PyBytes_AS_STRING(*obj);
7136}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137
Victor Stinner3a50e702011-10-18 21:21:00 +02007138static DWORD
7139decode_code_page_flags(UINT code_page)
7140{
7141 if (code_page == CP_UTF7) {
7142 /* The CP_UTF7 decoder only supports flags=0 */
7143 return 0;
7144 }
7145 else
7146 return MB_ERR_INVALID_CHARS;
7147}
7148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 * Decode a byte string from a Windows code page into unicode object in strict
7151 * mode.
7152 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007153 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7154 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007157decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007158 wchar_t **buf,
7159 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 const char *in,
7161 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007163 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007164 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166
7167 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007169 while ((outsize = MultiByteToWideChar(code_page, flags,
7170 in, insize, NULL, 0)) <= 0)
7171 {
7172 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7173 goto error;
7174 }
7175 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7176 flags = 0;
7177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007178
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007179 /* Extend a wchar_t* buffer */
7180 Py_ssize_t n = *bufsize; /* Get the current length */
7181 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7182 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007184 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185
7186 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7188 if (outsize <= 0)
7189 goto error;
7190 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192error:
7193 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7194 return -2;
7195 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007196 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197}
7198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199/*
7200 * Decode a byte string from a code page into unicode object with an error
7201 * handler.
7202 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007203 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 * UnicodeDecodeError exception and returns -1 on error.
7205 */
7206static int
7207decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007208 wchar_t **buf,
7209 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007211 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007212{
7213 const char *startin = in;
7214 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007215 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 /* Ideally, we should get reason from FormatMessage. This is the Windows
7217 2000 English version of the message. */
7218 const char *reason = "No mapping for the Unicode character exists "
7219 "in the target code page.";
7220 /* each step cannot decode more than 1 character, but a character can be
7221 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007222 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007223 int insize;
7224 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 PyObject *errorHandler = NULL;
7226 PyObject *exc = NULL;
7227 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007228 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 DWORD err;
7230 int ret = -1;
7231
7232 assert(size > 0);
7233
7234 encoding = code_page_name(code_page, &encoding_obj);
7235 if (encoding == NULL)
7236 return -1;
7237
Victor Stinner7d00cc12014-03-17 23:08:06 +01007238 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7240 UnicodeDecodeError. */
7241 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7242 if (exc != NULL) {
7243 PyCodec_StrictErrors(exc);
7244 Py_CLEAR(exc);
7245 }
7246 goto error;
7247 }
7248
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007249 /* Extend a wchar_t* buffer */
7250 Py_ssize_t n = *bufsize; /* Get the current length */
7251 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7252 PyErr_NoMemory();
7253 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007255 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7256 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007258 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007259
7260 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 while (in < endin)
7262 {
7263 /* Decode a character */
7264 insize = 1;
7265 do
7266 {
7267 outsize = MultiByteToWideChar(code_page, flags,
7268 in, insize,
7269 buffer, Py_ARRAY_LENGTH(buffer));
7270 if (outsize > 0)
7271 break;
7272 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007273 if (err == ERROR_INVALID_FLAGS && flags) {
7274 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7275 flags = 0;
7276 continue;
7277 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 if (err != ERROR_NO_UNICODE_TRANSLATION
7279 && err != ERROR_INSUFFICIENT_BUFFER)
7280 {
7281 PyErr_SetFromWindowsErr(0);
7282 goto error;
7283 }
7284 insize++;
7285 }
7286 /* 4=maximum length of a UTF-8 sequence */
7287 while (insize <= 4 && (in + insize) <= endin);
7288
7289 if (outsize <= 0) {
7290 Py_ssize_t startinpos, endinpos, outpos;
7291
Victor Stinner7d00cc12014-03-17 23:08:06 +01007292 /* last character in partial decode? */
7293 if (in + insize >= endin && !final)
7294 break;
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 startinpos = in - startin;
7297 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007298 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007299 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 errors, &errorHandler,
7301 encoding, reason,
7302 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007303 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 {
7305 goto error;
7306 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007307 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 }
7309 else {
7310 in += insize;
7311 memcpy(out, buffer, outsize * sizeof(wchar_t));
7312 out += outsize;
7313 }
7314 }
7315
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007316 /* Shrink the buffer */
7317 assert(out - *buf <= *bufsize);
7318 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007319 /* (in - startin) <= size and size is an int */
7320 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007321
7322error:
7323 Py_XDECREF(encoding_obj);
7324 Py_XDECREF(errorHandler);
7325 Py_XDECREF(exc);
7326 return ret;
7327}
7328
Victor Stinner3a50e702011-10-18 21:21:00 +02007329static PyObject *
7330decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 const char *s, Py_ssize_t size,
7332 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007334 wchar_t *buf = NULL;
7335 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 if (code_page < 0) {
7339 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7340 return NULL;
7341 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007342 if (size < 0) {
7343 PyErr_BadInternalCall();
7344 return NULL;
7345 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007346
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Victor Stinner76a31a62011-11-04 00:05:13 +01007350 do
7351 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007353 if (size > INT_MAX) {
7354 chunk_size = INT_MAX;
7355 final = 0;
7356 done = 0;
7357 }
7358 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 {
7361 chunk_size = (int)size;
7362 final = (consumed == NULL);
7363 done = 1;
7364 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365
Victor Stinner76a31a62011-11-04 00:05:13 +01007366 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007367 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007368 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007369 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007370 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007372 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 s, chunk_size);
7374 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007377 errors, final);
7378 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007379
7380 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 return NULL;
7383 }
7384
7385 if (consumed)
7386 *consumed += converted;
7387
7388 s += converted;
7389 size -= converted;
7390 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007391
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007392 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7393 PyMem_Free(buf);
7394 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395}
7396
Alexander Belopolsky40018472011-02-26 01:02:56 +00007397PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007398PyUnicode_DecodeCodePageStateful(int code_page,
7399 const char *s,
7400 Py_ssize_t size,
7401 const char *errors,
7402 Py_ssize_t *consumed)
7403{
7404 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7405}
7406
7407PyObject *
7408PyUnicode_DecodeMBCSStateful(const char *s,
7409 Py_ssize_t size,
7410 const char *errors,
7411 Py_ssize_t *consumed)
7412{
7413 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7414}
7415
7416PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007417PyUnicode_DecodeMBCS(const char *s,
7418 Py_ssize_t size,
7419 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007420{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7422}
7423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424static DWORD
7425encode_code_page_flags(UINT code_page, const char *errors)
7426{
7427 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007428 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 }
7430 else if (code_page == CP_UTF7) {
7431 /* CP_UTF7 only supports flags=0 */
7432 return 0;
7433 }
7434 else {
7435 if (errors != NULL && strcmp(errors, "replace") == 0)
7436 return 0;
7437 else
7438 return WC_NO_BEST_FIT_CHARS;
7439 }
7440}
7441
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 * Encode a Unicode string to a Windows code page into a byte string in strict
7444 * mode.
7445 *
7446 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007447 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007450encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453{
Victor Stinner554f3f02010-06-16 23:33:54 +00007454 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 BOOL *pusedDefaultChar = &usedDefaultChar;
7456 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007457 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 const DWORD flags = encode_code_page_flags(code_page, NULL);
7460 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 /* Create a substring so that we can get the UTF-16 representation
7462 of just the slice under consideration. */
7463 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007466
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007468 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007470 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007471
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 substring = PyUnicode_Substring(unicode, offset, offset+len);
7473 if (substring == NULL)
7474 return -1;
7475 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7476 if (p == NULL) {
7477 Py_DECREF(substring);
7478 return -1;
7479 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007480 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007484 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 NULL, 0,
7486 NULL, pusedDefaultChar);
7487 if (outsize <= 0)
7488 goto error;
7489 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 if (pusedDefaultChar && *pusedDefaultChar) {
7491 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 if (*outbytes == NULL) {
7499 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007503 }
7504 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 const Py_ssize_t n = PyBytes_Size(*outbytes);
7507 if (outsize > PY_SSIZE_T_MAX - n) {
7508 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007512 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7513 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007517 }
7518
7519 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007521 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 out, outsize,
7523 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 if (outsize <= 0)
7526 goto error;
7527 if (pusedDefaultChar && *pusedDefaultChar)
7528 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007529 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007530
Victor Stinner3a50e702011-10-18 21:21:00 +02007531error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007532 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7534 return -2;
7535 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007536 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007537}
7538
Victor Stinner3a50e702011-10-18 21:21:00 +02007539/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007540 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 * error handler.
7542 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007543 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 * -1 on other error.
7545 */
7546static int
7547encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007548 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007549 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007550{
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007552 Py_ssize_t pos = unicode_offset;
7553 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 /* Ideally, we should get reason from FormatMessage. This is the Windows
7555 2000 English version of the message. */
7556 const char *reason = "invalid character";
7557 /* 4=maximum length of a UTF-8 sequence */
7558 char buffer[4];
7559 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7560 Py_ssize_t outsize;
7561 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 PyObject *errorHandler = NULL;
7563 PyObject *exc = NULL;
7564 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007565 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 PyObject *rep;
7568 int ret = -1;
7569
7570 assert(insize > 0);
7571
7572 encoding = code_page_name(code_page, &encoding_obj);
7573 if (encoding == NULL)
7574 return -1;
7575
7576 if (errors == NULL || strcmp(errors, "strict") == 0) {
7577 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7578 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007579 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 if (exc != NULL) {
7581 PyCodec_StrictErrors(exc);
7582 Py_DECREF(exc);
7583 }
7584 Py_XDECREF(encoding_obj);
7585 return -1;
7586 }
7587
7588 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7589 pusedDefaultChar = &usedDefaultChar;
7590 else
7591 pusedDefaultChar = NULL;
7592
7593 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7594 PyErr_NoMemory();
7595 goto error;
7596 }
7597 outsize = insize * Py_ARRAY_LENGTH(buffer);
7598
7599 if (*outbytes == NULL) {
7600 /* Create string object */
7601 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7602 if (*outbytes == NULL)
7603 goto error;
7604 out = PyBytes_AS_STRING(*outbytes);
7605 }
7606 else {
7607 /* Extend string object */
7608 Py_ssize_t n = PyBytes_Size(*outbytes);
7609 if (n > PY_SSIZE_T_MAX - outsize) {
7610 PyErr_NoMemory();
7611 goto error;
7612 }
7613 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7614 goto error;
7615 out = PyBytes_AS_STRING(*outbytes) + n;
7616 }
7617
7618 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007619 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007621 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7622 wchar_t chars[2];
7623 int charsize;
7624 if (ch < 0x10000) {
7625 chars[0] = (wchar_t)ch;
7626 charsize = 1;
7627 }
7628 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007629 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7630 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007631 charsize = 2;
7632 }
7633
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 buffer, Py_ARRAY_LENGTH(buffer),
7637 NULL, pusedDefaultChar);
7638 if (outsize > 0) {
7639 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7640 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007641 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 memcpy(out, buffer, outsize);
7643 out += outsize;
7644 continue;
7645 }
7646 }
7647 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7648 PyErr_SetFromWindowsErr(0);
7649 goto error;
7650 }
7651
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 rep = unicode_encode_call_errorhandler(
7653 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007654 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007655 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 if (rep == NULL)
7657 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007658 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007659
7660 if (PyBytes_Check(rep)) {
7661 outsize = PyBytes_GET_SIZE(rep);
7662 if (outsize != 1) {
7663 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7664 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7665 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7666 Py_DECREF(rep);
7667 goto error;
7668 }
7669 out = PyBytes_AS_STRING(*outbytes) + offset;
7670 }
7671 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7672 out += outsize;
7673 }
7674 else {
7675 Py_ssize_t i;
7676 enum PyUnicode_Kind kind;
7677 void *data;
7678
Benjamin Petersonbac79492012-01-14 13:34:47 -05007679 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 Py_DECREF(rep);
7681 goto error;
7682 }
7683
7684 outsize = PyUnicode_GET_LENGTH(rep);
7685 if (outsize != 1) {
7686 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7687 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7688 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7689 Py_DECREF(rep);
7690 goto error;
7691 }
7692 out = PyBytes_AS_STRING(*outbytes) + offset;
7693 }
7694 kind = PyUnicode_KIND(rep);
7695 data = PyUnicode_DATA(rep);
7696 for (i=0; i < outsize; i++) {
7697 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7698 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007699 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007700 encoding, unicode,
7701 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 "unable to encode error handler result to ASCII");
7703 Py_DECREF(rep);
7704 goto error;
7705 }
7706 *out = (unsigned char)ch;
7707 out++;
7708 }
7709 }
7710 Py_DECREF(rep);
7711 }
7712 /* write a NUL byte */
7713 *out = 0;
7714 outsize = out - PyBytes_AS_STRING(*outbytes);
7715 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7716 if (_PyBytes_Resize(outbytes, outsize) < 0)
7717 goto error;
7718 ret = 0;
7719
7720error:
7721 Py_XDECREF(encoding_obj);
7722 Py_XDECREF(errorHandler);
7723 Py_XDECREF(exc);
7724 return ret;
7725}
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727static PyObject *
7728encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007730 const char *errors)
7731{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007733 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007736
Victor Stinner29dacf22015-01-26 16:41:32 +01007737 if (!PyUnicode_Check(unicode)) {
7738 PyErr_BadArgument();
7739 return NULL;
7740 }
7741
Benjamin Petersonbac79492012-01-14 13:34:47 -05007742 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007743 return NULL;
7744 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007745
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 if (code_page < 0) {
7747 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7748 return NULL;
7749 }
7750
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 return PyBytes_FromStringAndSize(NULL, 0);
7753
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 offset = 0;
7755 do
7756 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007758 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007759 chunks. */
7760 if (len > INT_MAX/2) {
7761 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 done = 0;
7763 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007766 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 done = 1;
7769 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007770
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 errors);
7774 if (ret == -2)
7775 ret = encode_code_page_errors(code_page, &outbytes,
7776 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 if (ret < 0) {
7779 Py_XDECREF(outbytes);
7780 return NULL;
7781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007785 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786
Victor Stinner3a50e702011-10-18 21:21:00 +02007787 return outbytes;
7788}
7789
7790PyObject *
7791PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7792 Py_ssize_t size,
7793 const char *errors)
7794{
Victor Stinner7581cef2011-11-03 22:32:33 +01007795 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007796 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007797 if (unicode == NULL)
7798 return NULL;
7799 res = encode_code_page(CP_ACP, unicode, errors);
7800 Py_DECREF(unicode);
7801 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007802}
7803
7804PyObject *
7805PyUnicode_EncodeCodePage(int code_page,
7806 PyObject *unicode,
7807 const char *errors)
7808{
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007810}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007811
Alexander Belopolsky40018472011-02-26 01:02:56 +00007812PyObject *
7813PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007814{
Victor Stinner7581cef2011-11-03 22:32:33 +01007815 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007816}
7817
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007818#undef NEED_RETRY
7819
Steve Dowercc16be82016-09-08 10:35:16 -07007820#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007821
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822/* --- Character Mapping Codec -------------------------------------------- */
7823
Victor Stinnerfb161b12013-04-18 01:44:27 +02007824static int
7825charmap_decode_string(const char *s,
7826 Py_ssize_t size,
7827 PyObject *mapping,
7828 const char *errors,
7829 _PyUnicodeWriter *writer)
7830{
7831 const char *starts = s;
7832 const char *e;
7833 Py_ssize_t startinpos, endinpos;
7834 PyObject *errorHandler = NULL, *exc = NULL;
7835 Py_ssize_t maplen;
7836 enum PyUnicode_Kind mapkind;
7837 void *mapdata;
7838 Py_UCS4 x;
7839 unsigned char ch;
7840
7841 if (PyUnicode_READY(mapping) == -1)
7842 return -1;
7843
7844 maplen = PyUnicode_GET_LENGTH(mapping);
7845 mapdata = PyUnicode_DATA(mapping);
7846 mapkind = PyUnicode_KIND(mapping);
7847
7848 e = s + size;
7849
7850 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7851 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7852 * is disabled in encoding aliases, latin1 is preferred because
7853 * its implementation is faster. */
7854 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7855 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7856 Py_UCS4 maxchar = writer->maxchar;
7857
7858 assert (writer->kind == PyUnicode_1BYTE_KIND);
7859 while (s < e) {
7860 ch = *s;
7861 x = mapdata_ucs1[ch];
7862 if (x > maxchar) {
7863 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7864 goto onError;
7865 maxchar = writer->maxchar;
7866 outdata = (Py_UCS1 *)writer->data;
7867 }
7868 outdata[writer->pos] = x;
7869 writer->pos++;
7870 ++s;
7871 }
7872 return 0;
7873 }
7874
7875 while (s < e) {
7876 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7877 enum PyUnicode_Kind outkind = writer->kind;
7878 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7879 if (outkind == PyUnicode_1BYTE_KIND) {
7880 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7881 Py_UCS4 maxchar = writer->maxchar;
7882 while (s < e) {
7883 ch = *s;
7884 x = mapdata_ucs2[ch];
7885 if (x > maxchar)
7886 goto Error;
7887 outdata[writer->pos] = x;
7888 writer->pos++;
7889 ++s;
7890 }
7891 break;
7892 }
7893 else if (outkind == PyUnicode_2BYTE_KIND) {
7894 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7895 while (s < e) {
7896 ch = *s;
7897 x = mapdata_ucs2[ch];
7898 if (x == 0xFFFE)
7899 goto Error;
7900 outdata[writer->pos] = x;
7901 writer->pos++;
7902 ++s;
7903 }
7904 break;
7905 }
7906 }
7907 ch = *s;
7908
7909 if (ch < maplen)
7910 x = PyUnicode_READ(mapkind, mapdata, ch);
7911 else
7912 x = 0xfffe; /* invalid value */
7913Error:
7914 if (x == 0xfffe)
7915 {
7916 /* undefined mapping */
7917 startinpos = s-starts;
7918 endinpos = startinpos+1;
7919 if (unicode_decode_call_errorhandler_writer(
7920 errors, &errorHandler,
7921 "charmap", "character maps to <undefined>",
7922 &starts, &e, &startinpos, &endinpos, &exc, &s,
7923 writer)) {
7924 goto onError;
7925 }
7926 continue;
7927 }
7928
7929 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7930 goto onError;
7931 ++s;
7932 }
7933 Py_XDECREF(errorHandler);
7934 Py_XDECREF(exc);
7935 return 0;
7936
7937onError:
7938 Py_XDECREF(errorHandler);
7939 Py_XDECREF(exc);
7940 return -1;
7941}
7942
7943static int
7944charmap_decode_mapping(const char *s,
7945 Py_ssize_t size,
7946 PyObject *mapping,
7947 const char *errors,
7948 _PyUnicodeWriter *writer)
7949{
7950 const char *starts = s;
7951 const char *e;
7952 Py_ssize_t startinpos, endinpos;
7953 PyObject *errorHandler = NULL, *exc = NULL;
7954 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007955 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007956
7957 e = s + size;
7958
7959 while (s < e) {
7960 ch = *s;
7961
7962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7963 key = PyLong_FromLong((long)ch);
7964 if (key == NULL)
7965 goto onError;
7966
7967 item = PyObject_GetItem(mapping, key);
7968 Py_DECREF(key);
7969 if (item == NULL) {
7970 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7971 /* No mapping found means: mapping is undefined. */
7972 PyErr_Clear();
7973 goto Undefined;
7974 } else
7975 goto onError;
7976 }
7977
7978 /* Apply mapping */
7979 if (item == Py_None)
7980 goto Undefined;
7981 if (PyLong_Check(item)) {
7982 long value = PyLong_AS_LONG(item);
7983 if (value == 0xFFFE)
7984 goto Undefined;
7985 if (value < 0 || value > MAX_UNICODE) {
7986 PyErr_Format(PyExc_TypeError,
7987 "character mapping must be in range(0x%lx)",
7988 (unsigned long)MAX_UNICODE + 1);
7989 goto onError;
7990 }
7991
7992 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7993 goto onError;
7994 }
7995 else if (PyUnicode_Check(item)) {
7996 if (PyUnicode_READY(item) == -1)
7997 goto onError;
7998 if (PyUnicode_GET_LENGTH(item) == 1) {
7999 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8000 if (value == 0xFFFE)
8001 goto Undefined;
8002 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8003 goto onError;
8004 }
8005 else {
8006 writer->overallocate = 1;
8007 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8008 goto onError;
8009 }
8010 }
8011 else {
8012 /* wrong return value */
8013 PyErr_SetString(PyExc_TypeError,
8014 "character mapping must return integer, None or str");
8015 goto onError;
8016 }
8017 Py_CLEAR(item);
8018 ++s;
8019 continue;
8020
8021Undefined:
8022 /* undefined mapping */
8023 Py_CLEAR(item);
8024 startinpos = s-starts;
8025 endinpos = startinpos+1;
8026 if (unicode_decode_call_errorhandler_writer(
8027 errors, &errorHandler,
8028 "charmap", "character maps to <undefined>",
8029 &starts, &e, &startinpos, &endinpos, &exc, &s,
8030 writer)) {
8031 goto onError;
8032 }
8033 }
8034 Py_XDECREF(errorHandler);
8035 Py_XDECREF(exc);
8036 return 0;
8037
8038onError:
8039 Py_XDECREF(item);
8040 Py_XDECREF(errorHandler);
8041 Py_XDECREF(exc);
8042 return -1;
8043}
8044
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045PyObject *
8046PyUnicode_DecodeCharmap(const char *s,
8047 Py_ssize_t size,
8048 PyObject *mapping,
8049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008051 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008052
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 /* Default to Latin-1 */
8054 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008058 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008059 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008060 writer.min_length = size;
8061 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008063
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008064 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008065 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8066 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008067 }
8068 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008069 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008072 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008073
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008075 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return NULL;
8077}
8078
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079/* Charmap encoding: the lookup table */
8080
Alexander Belopolsky40018472011-02-26 01:02:56 +00008081struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 PyObject_HEAD
8083 unsigned char level1[32];
8084 int count2, count3;
8085 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086};
8087
8088static PyObject*
8089encoding_map_size(PyObject *obj, PyObject* args)
8090{
8091 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094}
8095
8096static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 PyDoc_STR("Return the size (in bytes) of this object") },
8099 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100};
8101
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008103 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008104 "EncodingMap", /*tp_name*/
8105 sizeof(struct encoding_map), /*tp_basicsize*/
8106 0, /*tp_itemsize*/
8107 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008108 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008109 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 0, /*tp_getattr*/
8111 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008112 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008113 0, /*tp_repr*/
8114 0, /*tp_as_number*/
8115 0, /*tp_as_sequence*/
8116 0, /*tp_as_mapping*/
8117 0, /*tp_hash*/
8118 0, /*tp_call*/
8119 0, /*tp_str*/
8120 0, /*tp_getattro*/
8121 0, /*tp_setattro*/
8122 0, /*tp_as_buffer*/
8123 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8124 0, /*tp_doc*/
8125 0, /*tp_traverse*/
8126 0, /*tp_clear*/
8127 0, /*tp_richcompare*/
8128 0, /*tp_weaklistoffset*/
8129 0, /*tp_iter*/
8130 0, /*tp_iternext*/
8131 encoding_map_methods, /*tp_methods*/
8132 0, /*tp_members*/
8133 0, /*tp_getset*/
8134 0, /*tp_base*/
8135 0, /*tp_dict*/
8136 0, /*tp_descr_get*/
8137 0, /*tp_descr_set*/
8138 0, /*tp_dictoffset*/
8139 0, /*tp_init*/
8140 0, /*tp_alloc*/
8141 0, /*tp_new*/
8142 0, /*tp_free*/
8143 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008144};
8145
8146PyObject*
8147PyUnicode_BuildEncodingMap(PyObject* string)
8148{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008149 PyObject *result;
8150 struct encoding_map *mresult;
8151 int i;
8152 int need_dict = 0;
8153 unsigned char level1[32];
8154 unsigned char level2[512];
8155 unsigned char *mlevel1, *mlevel2, *mlevel3;
8156 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008157 int kind;
8158 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008159 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008160 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008161
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008162 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 PyErr_BadArgument();
8164 return NULL;
8165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 kind = PyUnicode_KIND(string);
8167 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 length = PyUnicode_GET_LENGTH(string);
8169 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008170 memset(level1, 0xFF, sizeof level1);
8171 memset(level2, 0xFF, sizeof level2);
8172
8173 /* If there isn't a one-to-one mapping of NULL to \0,
8174 or if there are non-BMP characters, we need to use
8175 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008176 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008178 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008179 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 ch = PyUnicode_READ(kind, data, i);
8181 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008182 need_dict = 1;
8183 break;
8184 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008185 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186 /* unmapped character */
8187 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 l1 = ch >> 11;
8189 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008190 if (level1[l1] == 0xFF)
8191 level1[l1] = count2++;
8192 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008193 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008194 }
8195
8196 if (count2 >= 0xFF || count3 >= 0xFF)
8197 need_dict = 1;
8198
8199 if (need_dict) {
8200 PyObject *result = PyDict_New();
8201 PyObject *key, *value;
8202 if (!result)
8203 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008204 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008205 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008206 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008207 if (!key || !value)
8208 goto failed1;
8209 if (PyDict_SetItem(result, key, value) == -1)
8210 goto failed1;
8211 Py_DECREF(key);
8212 Py_DECREF(value);
8213 }
8214 return result;
8215 failed1:
8216 Py_XDECREF(key);
8217 Py_XDECREF(value);
8218 Py_DECREF(result);
8219 return NULL;
8220 }
8221
8222 /* Create a three-level trie */
8223 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8224 16*count2 + 128*count3 - 1);
8225 if (!result)
8226 return PyErr_NoMemory();
8227 PyObject_Init(result, &EncodingMapType);
8228 mresult = (struct encoding_map*)result;
8229 mresult->count2 = count2;
8230 mresult->count3 = count3;
8231 mlevel1 = mresult->level1;
8232 mlevel2 = mresult->level23;
8233 mlevel3 = mresult->level23 + 16*count2;
8234 memcpy(mlevel1, level1, 32);
8235 memset(mlevel2, 0xFF, 16*count2);
8236 memset(mlevel3, 0, 128*count3);
8237 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008238 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008239 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008240 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8241 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008242 /* unmapped character */
8243 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008244 o1 = ch>>11;
8245 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 i2 = 16*mlevel1[o1] + o2;
8247 if (mlevel2[i2] == 0xFF)
8248 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008249 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 i3 = 128*mlevel2[i2] + o3;
8251 mlevel3[i3] = i;
8252 }
8253 return result;
8254}
8255
8256static int
Victor Stinner22168992011-11-20 17:09:18 +01008257encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258{
8259 struct encoding_map *map = (struct encoding_map*)mapping;
8260 int l1 = c>>11;
8261 int l2 = (c>>7) & 0xF;
8262 int l3 = c & 0x7F;
8263 int i;
8264
Victor Stinner22168992011-11-20 17:09:18 +01008265 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008266 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008267 if (c == 0)
8268 return 0;
8269 /* level 1*/
8270 i = map->level1[l1];
8271 if (i == 0xFF) {
8272 return -1;
8273 }
8274 /* level 2*/
8275 i = map->level23[16*i+l2];
8276 if (i == 0xFF) {
8277 return -1;
8278 }
8279 /* level 3 */
8280 i = map->level23[16*map->count2 + 128*i + l3];
8281 if (i == 0) {
8282 return -1;
8283 }
8284 return i;
8285}
8286
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008287/* Lookup the character ch in the mapping. If the character
8288 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008289 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008290static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008291charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008292{
Christian Heimes217cfd12007-12-02 14:31:20 +00008293 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008294 PyObject *x;
8295
8296 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008297 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 x = PyObject_GetItem(mapping, w);
8299 Py_DECREF(w);
8300 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8302 /* No mapping found means: mapping is undefined. */
8303 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008304 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 } else
8306 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008307 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008308 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008310 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 long value = PyLong_AS_LONG(x);
8312 if (value < 0 || value > 255) {
8313 PyErr_SetString(PyExc_TypeError,
8314 "character mapping must be in range(256)");
8315 Py_DECREF(x);
8316 return NULL;
8317 }
8318 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008319 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008320 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008321 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008322 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 /* wrong return value */
8324 PyErr_Format(PyExc_TypeError,
8325 "character mapping must return integer, bytes or None, not %.400s",
8326 x->ob_type->tp_name);
8327 Py_DECREF(x);
8328 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008329 }
8330}
8331
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008332static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008333charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008334{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008335 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8336 /* exponentially overallocate to minimize reallocations */
8337 if (requiredsize < 2*outsize)
8338 requiredsize = 2*outsize;
8339 if (_PyBytes_Resize(outobj, requiredsize))
8340 return -1;
8341 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342}
8343
/* Outcome of trying to encode one character through a character map
   (see charmapencode_output()). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were appended to the output */
    enc_FAILED,     /* the mapping does not define the character */
    enc_EXCEPTION   /* an error occurred; an exception is set */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008347/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008348 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008349 space is available. Return a new reference to the object that
8350 was put in the output buffer, or Py_None, if the mapping was undefined
8351 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008352 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008353static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008354charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008355 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008356{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008357 PyObject *rep;
8358 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008359 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360
Christian Heimes90aa7642007-12-19 02:45:37 +00008361 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008362 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008363 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008364 if (res == -1)
8365 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008366 if (outsize<requiredsize)
8367 if (charmapencode_resize(outobj, outpos, requiredsize))
8368 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008369 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 outstart[(*outpos)++] = (char)res;
8371 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 }
8373
8374 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008375 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008377 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 Py_DECREF(rep);
8379 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008381 if (PyLong_Check(rep)) {
8382 Py_ssize_t requiredsize = *outpos+1;
8383 if (outsize<requiredsize)
8384 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8385 Py_DECREF(rep);
8386 return enc_EXCEPTION;
8387 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008388 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008390 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008391 else {
8392 const char *repchars = PyBytes_AS_STRING(rep);
8393 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8394 Py_ssize_t requiredsize = *outpos+repsize;
8395 if (outsize<requiredsize)
8396 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8397 Py_DECREF(rep);
8398 return enc_EXCEPTION;
8399 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008400 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008401 memcpy(outstart + *outpos, repchars, repsize);
8402 *outpos += repsize;
8403 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008404 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008405 Py_DECREF(rep);
8406 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008407}
8408
8409/* handle an error in PyUnicode_EncodeCharmap
8410 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008411static int
8412charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008413 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008414 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008415 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008416 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008417{
8418 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008420 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008421 enum PyUnicode_Kind kind;
8422 void *data;
8423 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008424 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008425 Py_ssize_t collstartpos = *inpos;
8426 Py_ssize_t collendpos = *inpos+1;
8427 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008428 const char *encoding = "charmap";
8429 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008430 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008431 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008432 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008433
Benjamin Petersonbac79492012-01-14 13:34:47 -05008434 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008435 return -1;
8436 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437 /* find all unencodable characters */
8438 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008439 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008440 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008442 val = encoding_map_lookup(ch, mapping);
8443 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008444 break;
8445 ++collendpos;
8446 continue;
8447 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008448
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8450 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 if (rep==NULL)
8452 return -1;
8453 else if (rep!=Py_None) {
8454 Py_DECREF(rep);
8455 break;
8456 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008457 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008459 }
8460 /* cache callback name lookup
8461 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008462 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008463 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008464
8465 switch (*error_handler) {
8466 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008467 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008468 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008469
8470 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 x = charmapencode_output('?', mapping, res, respos);
8473 if (x==enc_EXCEPTION) {
8474 return -1;
8475 }
8476 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008477 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 return -1;
8479 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008480 }
8481 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008482 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008483 *inpos = collendpos;
8484 break;
Victor Stinner50149202015-09-22 00:26:54 +02008485
8486 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 /* generate replacement (temporarily (mis)uses p) */
8488 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008489 char buffer[2+29+1+1];
8490 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008491 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 for (cp = buffer; *cp; ++cp) {
8493 x = charmapencode_output(*cp, mapping, res, respos);
8494 if (x==enc_EXCEPTION)
8495 return -1;
8496 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008497 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 return -1;
8499 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008500 }
8501 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008502 *inpos = collendpos;
8503 break;
Victor Stinner50149202015-09-22 00:26:54 +02008504
Benjamin Peterson14339b62009-01-31 16:36:08 +00008505 default:
Victor Stinner50149202015-09-22 00:26:54 +02008506 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008507 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008508 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008510 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008511 if (PyBytes_Check(repunicode)) {
8512 /* Directly copy bytes result to output. */
8513 Py_ssize_t outsize = PyBytes_Size(*res);
8514 Py_ssize_t requiredsize;
8515 repsize = PyBytes_Size(repunicode);
8516 requiredsize = *respos + repsize;
8517 if (requiredsize > outsize)
8518 /* Make room for all additional bytes. */
8519 if (charmapencode_resize(res, respos, requiredsize)) {
8520 Py_DECREF(repunicode);
8521 return -1;
8522 }
8523 memcpy(PyBytes_AsString(*res) + *respos,
8524 PyBytes_AsString(repunicode), repsize);
8525 *respos += repsize;
8526 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008527 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008528 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008529 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008530 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008531 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008532 Py_DECREF(repunicode);
8533 return -1;
8534 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008535 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008536 data = PyUnicode_DATA(repunicode);
8537 kind = PyUnicode_KIND(repunicode);
8538 for (index = 0; index < repsize; index++) {
8539 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8540 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008541 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008542 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008543 return -1;
8544 }
8545 else if (x==enc_FAILED) {
8546 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008547 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008548 return -1;
8549 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008550 }
8551 *inpos = newpos;
8552 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008553 }
8554 return 0;
8555}
8556
Alexander Belopolsky40018472011-02-26 01:02:56 +00008557PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008558_PyUnicode_EncodeCharmap(PyObject *unicode,
8559 PyObject *mapping,
8560 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008561{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008562 /* output object */
8563 PyObject *res = NULL;
8564 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008565 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008568 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008569 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008571 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008572 void *data;
8573 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008574
Benjamin Petersonbac79492012-01-14 13:34:47 -05008575 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008576 return NULL;
8577 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008578 data = PyUnicode_DATA(unicode);
8579 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580
Guido van Rossumd57fd912000-03-10 22:53:23 +00008581 /* Default to Latin-1 */
8582 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008583 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008584
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008585 /* allocate enough for a simple encoding without
8586 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008587 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008588 if (res == NULL)
8589 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008590 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008591 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008594 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008596 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 if (x==enc_EXCEPTION) /* error */
8598 goto onError;
8599 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008602 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 &res, &respos)) {
8604 goto onError;
8605 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008606 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 else
8608 /* done with this character => adjust input position */
8609 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008610 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008611
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008612 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008613 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008614 if (_PyBytes_Resize(&res, respos) < 0)
8615 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008616
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008617 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008618 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008619 return res;
8620
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008622 Py_XDECREF(res);
8623 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008624 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625 return NULL;
8626}
8627
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008628/* Deprecated */
8629PyObject *
8630PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8631 Py_ssize_t size,
8632 PyObject *mapping,
8633 const char *errors)
8634{
8635 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008636 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008637 if (unicode == NULL)
8638 return NULL;
8639 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8640 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008641 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008642}
8643
Alexander Belopolsky40018472011-02-26 01:02:56 +00008644PyObject *
8645PyUnicode_AsCharmapString(PyObject *unicode,
8646 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008647{
8648 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008649 PyErr_BadArgument();
8650 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008652 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653}
8654
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008655/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008656static void
8657make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008658 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008659 Py_ssize_t startpos, Py_ssize_t endpos,
8660 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008662 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008663 *exceptionObject = _PyUnicodeTranslateError_Create(
8664 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
8666 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8668 goto onError;
8669 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8670 goto onError;
8671 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8672 goto onError;
8673 return;
8674 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008675 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008676 }
8677}
8678
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008679/* error handling callback helper:
8680 build arguments, call the callback and check the arguments,
8681 put the result into newpos and return the replacement string, which
8682 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008683static PyObject *
8684unicode_translate_call_errorhandler(const char *errors,
8685 PyObject **errorHandler,
8686 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008687 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008688 Py_ssize_t startpos, Py_ssize_t endpos,
8689 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008690{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008691 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008692
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008693 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 PyObject *restuple;
8695 PyObject *resunicode;
8696
8697 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008698 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008699 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008700 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008701 }
8702
8703 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008704 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008708 restuple = PyObject_CallFunctionObjArgs(
8709 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008710 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008711 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008713 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 Py_DECREF(restuple);
8715 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008717 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 &resunicode, &i_newpos)) {
8719 Py_DECREF(restuple);
8720 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008722 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008723 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008724 else
8725 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008726 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008727 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 Py_DECREF(restuple);
8729 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008730 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008731 Py_INCREF(resunicode);
8732 Py_DECREF(restuple);
8733 return resunicode;
8734}
8735
8736/* Lookup the character ch in the mapping and put the result in result,
8737 which must be decrefed by the caller.
8738 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008739static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741{
Christian Heimes217cfd12007-12-02 14:31:20 +00008742 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743 PyObject *x;
8744
8745 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008746 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747 x = PyObject_GetItem(mapping, w);
8748 Py_DECREF(w);
8749 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8751 /* No mapping found means: use 1:1 mapping. */
8752 PyErr_Clear();
8753 *result = NULL;
8754 return 0;
8755 } else
8756 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 }
8758 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008759 *result = x;
8760 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008762 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008764 if (value < 0 || value > MAX_UNICODE) {
8765 PyErr_Format(PyExc_ValueError,
8766 "character mapping must be in range(0x%x)",
8767 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008768 Py_DECREF(x);
8769 return -1;
8770 }
8771 *result = x;
8772 return 0;
8773 }
8774 else if (PyUnicode_Check(x)) {
8775 *result = x;
8776 return 0;
8777 }
8778 else {
8779 /* wrong return value */
8780 PyErr_SetString(PyExc_TypeError,
8781 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008782 Py_DECREF(x);
8783 return -1;
8784 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008785}
Victor Stinner1194ea02014-04-04 19:37:40 +02008786
8787/* lookup the character, write the result into the writer.
8788 Return 1 if the result was written into the writer, return 0 if the mapping
8789 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008790static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008791charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8792 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793{
Victor Stinner1194ea02014-04-04 19:37:40 +02008794 PyObject *item;
8795
8796 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008797 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008798
8799 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008800 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008801 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008802 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008804 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008805 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008806
8807 if (item == Py_None) {
8808 Py_DECREF(item);
8809 return 0;
8810 }
8811
8812 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008813 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8814 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8815 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008816 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8817 Py_DECREF(item);
8818 return -1;
8819 }
8820 Py_DECREF(item);
8821 return 1;
8822 }
8823
8824 if (!PyUnicode_Check(item)) {
8825 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008826 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008827 }
8828
8829 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8830 Py_DECREF(item);
8831 return -1;
8832 }
8833
8834 Py_DECREF(item);
8835 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008836}
8837
Victor Stinner89a76ab2014-04-05 11:44:04 +02008838static int
8839unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8840 Py_UCS1 *translate)
8841{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008842 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008843 int ret = 0;
8844
Victor Stinner89a76ab2014-04-05 11:44:04 +02008845 if (charmaptranslate_lookup(ch, mapping, &item)) {
8846 return -1;
8847 }
8848
8849 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008850 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008851 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008853 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008854 /* not found => default to 1:1 mapping */
8855 translate[ch] = ch;
8856 return 1;
8857 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008858 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008859 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008860 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8861 used it */
8862 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008863 /* invalid character or character outside ASCII:
8864 skip the fast translate */
8865 goto exit;
8866 }
8867 translate[ch] = (Py_UCS1)replace;
8868 }
8869 else if (PyUnicode_Check(item)) {
8870 Py_UCS4 replace;
8871
8872 if (PyUnicode_READY(item) == -1) {
8873 Py_DECREF(item);
8874 return -1;
8875 }
8876 if (PyUnicode_GET_LENGTH(item) != 1)
8877 goto exit;
8878
8879 replace = PyUnicode_READ_CHAR(item, 0);
8880 if (replace > 127)
8881 goto exit;
8882 translate[ch] = (Py_UCS1)replace;
8883 }
8884 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008885 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008886 goto exit;
8887 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008888 ret = 1;
8889
Benjamin Peterson1365de72014-04-07 20:15:41 -04008890 exit:
8891 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 return ret;
8893}
8894
8895/* Fast path for ascii => ascii translation. Return 1 if the whole string
8896 was translated into writer, return 0 if the input string was partially
8897 translated into writer, raise an exception and return -1 on error. */
8898static int
8899unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008900 _PyUnicodeWriter *writer, int ignore,
8901 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902{
Victor Stinner872b2912014-04-05 14:27:07 +02008903 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008904 Py_ssize_t len;
8905 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008906 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008907
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 len = PyUnicode_GET_LENGTH(input);
8909
Victor Stinner872b2912014-04-05 14:27:07 +02008910 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911
8912 in = PyUnicode_1BYTE_DATA(input);
8913 end = in + len;
8914
8915 assert(PyUnicode_IS_ASCII(writer->buffer));
8916 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8917 out = PyUnicode_1BYTE_DATA(writer->buffer);
8918
Victor Stinner872b2912014-04-05 14:27:07 +02008919 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008920 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008921 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008923 int translate = unicode_fast_translate_lookup(mapping, ch,
8924 ascii_table);
8925 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008927 if (translate == 0)
8928 goto exit;
8929 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930 }
Victor Stinner872b2912014-04-05 14:27:07 +02008931 if (ch2 == 0xfe) {
8932 if (ignore)
8933 continue;
8934 goto exit;
8935 }
8936 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008937 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008938 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008939 }
Victor Stinner872b2912014-04-05 14:27:07 +02008940 res = 1;
8941
8942exit:
8943 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008944 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008945 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008946}
8947
Victor Stinner3222da22015-10-01 22:07:32 +02008948static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008949_PyUnicode_TranslateCharmap(PyObject *input,
8950 PyObject *mapping,
8951 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008952{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008954 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955 Py_ssize_t size, i;
8956 int kind;
8957 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 _PyUnicodeWriter writer;
8959 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008960 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008961 PyObject *errorHandler = NULL;
8962 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008963 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008964 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008967 PyErr_BadArgument();
8968 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008969 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008971 if (PyUnicode_READY(input) == -1)
8972 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008973 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008974 kind = PyUnicode_KIND(input);
8975 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008976
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008977 if (size == 0)
8978 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008980 /* allocate enough for a simple 1:1 translation without
8981 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008982 _PyUnicodeWriter_Init(&writer);
8983 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008984 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008985
Victor Stinner872b2912014-04-05 14:27:07 +02008986 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8987
Victor Stinner33798672016-03-01 21:59:58 +01008988 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008989 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008990 if (PyUnicode_IS_ASCII(input)) {
8991 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8992 if (res < 0) {
8993 _PyUnicodeWriter_Dealloc(&writer);
8994 return NULL;
8995 }
8996 if (res == 1)
8997 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008998 }
Victor Stinner33798672016-03-01 21:59:58 +01008999 else {
9000 i = 0;
9001 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009002
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009003 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009004 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009005 int translate;
9006 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9007 Py_ssize_t newpos;
9008 /* startpos for collecting untranslatable chars */
9009 Py_ssize_t collstart;
9010 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009012
Victor Stinner1194ea02014-04-04 19:37:40 +02009013 ch = PyUnicode_READ(kind, data, i);
9014 translate = charmaptranslate_output(ch, mapping, &writer);
9015 if (translate < 0)
9016 goto onError;
9017
9018 if (translate != 0) {
9019 /* it worked => adjust input pointer */
9020 ++i;
9021 continue;
9022 }
9023
9024 /* untranslatable character */
9025 collstart = i;
9026 collend = i+1;
9027
9028 /* find all untranslatable characters */
9029 while (collend < size) {
9030 PyObject *x;
9031 ch = PyUnicode_READ(kind, data, collend);
9032 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009033 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009034 Py_XDECREF(x);
9035 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009036 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009037 ++collend;
9038 }
9039
9040 if (ignore) {
9041 i = collend;
9042 }
9043 else {
9044 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9045 reason, input, &exc,
9046 collstart, collend, &newpos);
9047 if (repunicode == NULL)
9048 goto onError;
9049 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009051 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009052 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009053 Py_DECREF(repunicode);
9054 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009055 }
9056 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009057 Py_XDECREF(exc);
9058 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009060
Benjamin Peterson29060642009-01-31 22:14:21 +00009061 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009062 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063 Py_XDECREF(exc);
9064 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009065 return NULL;
9066}
9067
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009068/* Deprecated. Use PyUnicode_Translate instead. */
9069PyObject *
9070PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9071 Py_ssize_t size,
9072 PyObject *mapping,
9073 const char *errors)
9074{
Christian Heimes5f520f42012-09-11 14:03:25 +02009075 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009076 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009077 if (!unicode)
9078 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009079 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9080 Py_DECREF(unicode);
9081 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082}
9083
Alexander Belopolsky40018472011-02-26 01:02:56 +00009084PyObject *
9085PyUnicode_Translate(PyObject *str,
9086 PyObject *mapping,
9087 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009088{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009089 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009090 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009091 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092}
Tim Petersced69f82003-09-16 20:30:58 +00009093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094PyObject *
9095_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9096{
9097 if (!PyUnicode_Check(unicode)) {
9098 PyErr_BadInternalCall();
9099 return NULL;
9100 }
9101 if (PyUnicode_READY(unicode) == -1)
9102 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009103 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009104 /* If the string is already ASCII, just return the same string */
9105 Py_INCREF(unicode);
9106 return unicode;
9107 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009108
9109 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9110 PyObject *result = PyUnicode_New(len, 127);
9111 if (result == NULL) {
9112 return NULL;
9113 }
9114
9115 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9116 int kind = PyUnicode_KIND(unicode);
9117 const void *data = PyUnicode_DATA(unicode);
9118 Py_ssize_t i;
9119 for (i = 0; i < len; ++i) {
9120 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9121 if (ch < 127) {
9122 out[i] = ch;
9123 }
9124 else if (Py_UNICODE_ISSPACE(ch)) {
9125 out[i] = ' ';
9126 }
9127 else {
9128 int decimal = Py_UNICODE_TODECIMAL(ch);
9129 if (decimal < 0) {
9130 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009131 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009132 _PyUnicode_LENGTH(result) = i + 1;
9133 break;
9134 }
9135 out[i] = '0' + decimal;
9136 }
9137 }
9138
INADA Naoki16dfca42018-07-14 12:06:43 +09009139 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009140 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009141}
9142
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009143PyObject *
9144PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9145 Py_ssize_t length)
9146{
Victor Stinnerf0124502011-11-21 23:12:56 +01009147 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009148 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009149 Py_UCS4 maxchar;
9150 enum PyUnicode_Kind kind;
9151 void *data;
9152
Victor Stinner99d7ad02012-02-22 13:37:39 +01009153 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009155 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156 if (ch > 127) {
9157 int decimal = Py_UNICODE_TODECIMAL(ch);
9158 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009159 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009160 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009161 }
9162 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009163
9164 /* Copy to a new string */
9165 decimal = PyUnicode_New(length, maxchar);
9166 if (decimal == NULL)
9167 return decimal;
9168 kind = PyUnicode_KIND(decimal);
9169 data = PyUnicode_DATA(decimal);
9170 /* Iterate over code points */
9171 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009172 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009173 if (ch > 127) {
9174 int decimal = Py_UNICODE_TODECIMAL(ch);
9175 if (decimal >= 0)
9176 ch = '0' + decimal;
9177 }
9178 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009179 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009180 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009181}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009182/* --- Decimal Encoder ---------------------------------------------------- */
9183
Alexander Belopolsky40018472011-02-26 01:02:56 +00009184int
9185PyUnicode_EncodeDecimal(Py_UNICODE *s,
9186 Py_ssize_t length,
9187 char *output,
9188 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009189{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009190 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009191 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009192 enum PyUnicode_Kind kind;
9193 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009194
9195 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009196 PyErr_BadArgument();
9197 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009198 }
9199
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009200 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009201 if (unicode == NULL)
9202 return -1;
9203
Victor Stinner42bf7752011-11-21 22:52:58 +01009204 kind = PyUnicode_KIND(unicode);
9205 data = PyUnicode_DATA(unicode);
9206
Victor Stinnerb84d7232011-11-22 01:50:07 +01009207 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009208 PyObject *exc;
9209 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009211 Py_ssize_t startpos;
9212
9213 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009214
Benjamin Peterson29060642009-01-31 22:14:21 +00009215 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009216 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009217 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009219 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009220 decimal = Py_UNICODE_TODECIMAL(ch);
9221 if (decimal >= 0) {
9222 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009223 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 continue;
9225 }
9226 if (0 < ch && ch < 256) {
9227 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009228 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 continue;
9230 }
Victor Stinner6345be92011-11-25 20:09:01 +01009231
Victor Stinner42bf7752011-11-21 22:52:58 +01009232 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009233 exc = NULL;
9234 raise_encode_exception(&exc, "decimal", unicode,
9235 startpos, startpos+1,
9236 "invalid decimal Unicode string");
9237 Py_XDECREF(exc);
9238 Py_DECREF(unicode);
9239 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009240 }
9241 /* 0-terminate the output string */
9242 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009243 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009244 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009245}
9246
Guido van Rossumd57fd912000-03-10 22:53:23 +00009247/* --- Helpers ------------------------------------------------------------ */
9248
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.  A `start` larger than
   `len` is left untouched.

   `start` and `end` must be modifiable lvalues.  Every argument may be
   evaluated more than once, so pass side-effect-free expressions.

   The body is wrapped in do { } while (0) so that the expansion is a single
   statement and stays correct inside an unbraced if/else; invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9263
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009265any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009266 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009267 Py_ssize_t end,
9268 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009269{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009270 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009271 void *buf1, *buf2;
9272 Py_ssize_t len1, len2, result;
9273
9274 kind1 = PyUnicode_KIND(s1);
9275 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009276 if (kind1 < kind2)
9277 return -1;
9278
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 len1 = PyUnicode_GET_LENGTH(s1);
9280 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009281 ADJUST_INDICES(start, end, len1);
9282 if (end - start < len2)
9283 return -1;
9284
9285 buf1 = PyUnicode_DATA(s1);
9286 buf2 = PyUnicode_DATA(s2);
9287 if (len2 == 1) {
9288 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9289 result = findchar((const char *)buf1 + kind1*start,
9290 kind1, end - start, ch, direction);
9291 if (result == -1)
9292 return -1;
9293 else
9294 return start + result;
9295 }
9296
9297 if (kind2 != kind1) {
9298 buf2 = _PyUnicode_AsKind(s2, kind1);
9299 if (!buf2)
9300 return -2;
9301 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302
Victor Stinner794d5672011-10-10 03:21:36 +02009303 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009304 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009305 case PyUnicode_1BYTE_KIND:
9306 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9307 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9308 else
9309 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9310 break;
9311 case PyUnicode_2BYTE_KIND:
9312 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9313 break;
9314 case PyUnicode_4BYTE_KIND:
9315 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9316 break;
9317 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009318 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009319 }
9320 }
9321 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009322 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009323 case PyUnicode_1BYTE_KIND:
9324 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9325 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9326 else
9327 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9328 break;
9329 case PyUnicode_2BYTE_KIND:
9330 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9331 break;
9332 case PyUnicode_4BYTE_KIND:
9333 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334 break;
9335 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009336 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009337 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009338 }
9339
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009340 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009341 PyMem_Free(buf2);
9342
9343 return result;
9344}
9345
Victor Stinner59423e32018-11-26 13:40:01 +01009346/* _PyUnicode_InsertThousandsGrouping() helper functions */
9347#include "stringlib/localeutil.h"
9348
9349/**
9350 * InsertThousandsGrouping:
9351 * @writer: Unicode writer.
9352 * @n_buffer: Number of characters in @buffer.
9353 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9354 * @d_pos: Start of digits string.
9355 * @n_digits: The number of digits in the string, in which we want
9356 * to put the grouping chars.
9357 * @min_width: The minimum width of the digits in the output string.
9358 * Output will be zero-padded on the left to fill.
9359 * @grouping: see definition in localeconv().
9360 * @thousands_sep: see definition in localeconv().
9361 *
9362 * There are 2 modes: counting and filling. If @writer is NULL,
9363 * we are in counting mode, else filling mode.
9364 * If counting, the required buffer size is returned.
9365 * If filling, we know the buffer will be large enough, so we don't
9366 * need to pass in the buffer size.
9367 * Inserts thousand grouping characters (as defined by grouping and
9368 * thousands_sep) into @writer.
9369 *
9370 * Return value: -1 on error, number of characters otherwise.
9371 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009372Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009373_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009374 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009375 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009376 PyObject *digits,
9377 Py_ssize_t d_pos,
9378 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009380 const char *grouping,
9381 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009382 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009383{
Xtreak3f7983a2019-01-07 20:39:14 +05309384 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009385 if (writer) {
9386 assert(digits != NULL);
9387 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 }
9389 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009390 assert(digits == NULL);
9391 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009392 }
Victor Stinner59423e32018-11-26 13:40:01 +01009393 assert(0 <= d_pos);
9394 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009395 assert(grouping != NULL);
9396
9397 if (digits != NULL) {
9398 if (PyUnicode_READY(digits) == -1) {
9399 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009400 }
Victor Stinner59423e32018-11-26 13:40:01 +01009401 }
9402 if (PyUnicode_READY(thousands_sep) == -1) {
9403 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009404 }
9405
Victor Stinner59423e32018-11-26 13:40:01 +01009406 Py_ssize_t count = 0;
9407 Py_ssize_t n_zeros;
9408 int loop_broken = 0;
9409 int use_separator = 0; /* First time through, don't append the
9410 separator. They only go between
9411 groups. */
9412 Py_ssize_t buffer_pos;
9413 Py_ssize_t digits_pos;
9414 Py_ssize_t len;
9415 Py_ssize_t n_chars;
9416 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9417 be looked at */
9418 /* A generator that returns all of the grouping widths, until it
9419 returns 0. */
9420 GroupGenerator groupgen;
9421 GroupGenerator_init(&groupgen, grouping);
9422 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9423
9424 /* if digits are not grouped, thousands separator
9425 should be an empty string */
9426 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9427
9428 digits_pos = d_pos + n_digits;
9429 if (writer) {
9430 buffer_pos = writer->pos + n_buffer;
9431 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9432 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009433 }
Victor Stinner59423e32018-11-26 13:40:01 +01009434 else {
9435 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009436 }
Victor Stinner59423e32018-11-26 13:40:01 +01009437
9438 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009439 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009440 }
Victor Stinner59423e32018-11-26 13:40:01 +01009441
9442 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9443 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9444 n_zeros = Py_MAX(0, len - remaining);
9445 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9446
9447 /* Use n_zero zero's and n_chars chars */
9448
9449 /* Count only, don't do anything. */
9450 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9451
9452 /* Copy into the writer. */
9453 InsertThousandsGrouping_fill(writer, &buffer_pos,
9454 digits, &digits_pos,
9455 n_chars, n_zeros,
9456 use_separator ? thousands_sep : NULL,
9457 thousands_sep_len, maxchar);
9458
9459 /* Use a separator next time. */
9460 use_separator = 1;
9461
9462 remaining -= n_chars;
9463 min_width -= len;
9464
9465 if (remaining <= 0 && min_width <= 0) {
9466 loop_broken = 1;
9467 break;
9468 }
9469 min_width -= thousands_sep_len;
9470 }
9471 if (!loop_broken) {
9472 /* We left the loop without using a break statement. */
9473
9474 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9475 n_zeros = Py_MAX(0, len - remaining);
9476 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9477
9478 /* Use n_zero zero's and n_chars chars */
9479 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9480
9481 /* Copy into the writer. */
9482 InsertThousandsGrouping_fill(writer, &buffer_pos,
9483 digits, &digits_pos,
9484 n_chars, n_zeros,
9485 use_separator ? thousands_sep : NULL,
9486 thousands_sep_len, maxchar);
9487 }
9488 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009489}
9490
9491
Alexander Belopolsky40018472011-02-26 01:02:56 +00009492Py_ssize_t
9493PyUnicode_Count(PyObject *str,
9494 PyObject *substr,
9495 Py_ssize_t start,
9496 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009497{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009498 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009499 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009500 void *buf1 = NULL, *buf2 = NULL;
9501 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009502
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009503 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009504 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009505
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009506 kind1 = PyUnicode_KIND(str);
9507 kind2 = PyUnicode_KIND(substr);
9508 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009509 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009510
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009511 len1 = PyUnicode_GET_LENGTH(str);
9512 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009513 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009514 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009515 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 buf1 = PyUnicode_DATA(str);
9518 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009519 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009520 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009521 if (!buf2)
9522 goto onError;
9523 }
9524
9525 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009526 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009527 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009528 result = asciilib_count(
9529 ((Py_UCS1*)buf1) + start, end - start,
9530 buf2, len2, PY_SSIZE_T_MAX
9531 );
9532 else
9533 result = ucs1lib_count(
9534 ((Py_UCS1*)buf1) + start, end - start,
9535 buf2, len2, PY_SSIZE_T_MAX
9536 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009537 break;
9538 case PyUnicode_2BYTE_KIND:
9539 result = ucs2lib_count(
9540 ((Py_UCS2*)buf1) + start, end - start,
9541 buf2, len2, PY_SSIZE_T_MAX
9542 );
9543 break;
9544 case PyUnicode_4BYTE_KIND:
9545 result = ucs4lib_count(
9546 ((Py_UCS4*)buf1) + start, end - start,
9547 buf2, len2, PY_SSIZE_T_MAX
9548 );
9549 break;
9550 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009551 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009552 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009553
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009554 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009555 PyMem_Free(buf2);
9556
Guido van Rossumd57fd912000-03-10 22:53:23 +00009557 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009559 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 PyMem_Free(buf2);
9561 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009562}
9563
Alexander Belopolsky40018472011-02-26 01:02:56 +00009564Py_ssize_t
9565PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009566 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009567 Py_ssize_t start,
9568 Py_ssize_t end,
9569 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009571 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009572 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009573
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009574 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009575}
9576
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009577Py_ssize_t
9578PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9579 Py_ssize_t start, Py_ssize_t end,
9580 int direction)
9581{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009582 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009583 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009584 if (PyUnicode_READY(str) == -1)
9585 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009586 len = PyUnicode_GET_LENGTH(str);
9587 ADJUST_INDICES(start, end, len);
9588 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009589 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009591 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9592 kind, end-start, ch, direction);
9593 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009595 else
9596 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009597}
9598
Alexander Belopolsky40018472011-02-26 01:02:56 +00009599static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009600tailmatch(PyObject *self,
9601 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009602 Py_ssize_t start,
9603 Py_ssize_t end,
9604 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009605{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009606 int kind_self;
9607 int kind_sub;
9608 void *data_self;
9609 void *data_sub;
9610 Py_ssize_t offset;
9611 Py_ssize_t i;
9612 Py_ssize_t end_sub;
9613
9614 if (PyUnicode_READY(self) == -1 ||
9615 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009616 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009617
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009618 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9619 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009621 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009622
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009623 if (PyUnicode_GET_LENGTH(substring) == 0)
9624 return 1;
9625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 kind_self = PyUnicode_KIND(self);
9627 data_self = PyUnicode_DATA(self);
9628 kind_sub = PyUnicode_KIND(substring);
9629 data_sub = PyUnicode_DATA(substring);
9630 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9631
9632 if (direction > 0)
9633 offset = end;
9634 else
9635 offset = start;
9636
9637 if (PyUnicode_READ(kind_self, data_self, offset) ==
9638 PyUnicode_READ(kind_sub, data_sub, 0) &&
9639 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9640 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9641 /* If both are of the same kind, memcmp is sufficient */
9642 if (kind_self == kind_sub) {
9643 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009644 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009645 data_sub,
9646 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009647 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009648 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009649 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009650 else {
9651 /* We do not need to compare 0 and len(substring)-1 because
9652 the if statement above ensured already that they are equal
9653 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 for (i = 1; i < end_sub; ++i) {
9655 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9656 PyUnicode_READ(kind_sub, data_sub, i))
9657 return 0;
9658 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009659 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009661 }
9662
9663 return 0;
9664}
9665
Alexander Belopolsky40018472011-02-26 01:02:56 +00009666Py_ssize_t
9667PyUnicode_Tailmatch(PyObject *str,
9668 PyObject *substr,
9669 Py_ssize_t start,
9670 Py_ssize_t end,
9671 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009672{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009673 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009674 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009675
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009676 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009677}
9678
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009679static PyObject *
9680ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009682 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9683 char *resdata, *data = PyUnicode_DATA(self);
9684 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009685
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 res = PyUnicode_New(len, 127);
9687 if (res == NULL)
9688 return NULL;
9689 resdata = PyUnicode_DATA(res);
9690 if (lower)
9691 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009692 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693 _Py_bytes_upper(resdata, data, len);
9694 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695}
9696
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009697static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009698handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 Py_ssize_t j;
9701 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009702 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009703 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009704
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009705 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9706
9707 where ! is a negation and \p{xxx} is a character with property xxx.
9708 */
9709 for (j = i - 1; j >= 0; j--) {
9710 c = PyUnicode_READ(kind, data, j);
9711 if (!_PyUnicode_IsCaseIgnorable(c))
9712 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9715 if (final_sigma) {
9716 for (j = i + 1; j < length; j++) {
9717 c = PyUnicode_READ(kind, data, j);
9718 if (!_PyUnicode_IsCaseIgnorable(c))
9719 break;
9720 }
9721 final_sigma = j == length || !_PyUnicode_IsCased(c);
9722 }
9723 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009724}
9725
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009726static int
9727lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9728 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009729{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730 /* Obscure special case. */
9731 if (c == 0x3A3) {
9732 mapped[0] = handle_capital_sigma(kind, data, length, i);
9733 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009734 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009735 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009736}
9737
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738static Py_ssize_t
9739do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 Py_ssize_t i, k = 0;
9742 int n_res, j;
9743 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009744
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009745 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009746 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009747 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009748 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 for (i = 1; i < length; i++) {
9752 c = PyUnicode_READ(kind, data, i);
9753 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9754 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009755 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009756 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009757 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009758 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009760}
9761
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762static Py_ssize_t
9763do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9764 Py_ssize_t i, k = 0;
9765
9766 for (i = 0; i < length; i++) {
9767 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9768 int n_res, j;
9769 if (Py_UNICODE_ISUPPER(c)) {
9770 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9771 }
9772 else if (Py_UNICODE_ISLOWER(c)) {
9773 n_res = _PyUnicode_ToUpperFull(c, mapped);
9774 }
9775 else {
9776 n_res = 1;
9777 mapped[0] = c;
9778 }
9779 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009780 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009781 res[k++] = mapped[j];
9782 }
9783 }
9784 return k;
9785}
9786
9787static Py_ssize_t
9788do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9789 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009790{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009791 Py_ssize_t i, k = 0;
9792
9793 for (i = 0; i < length; i++) {
9794 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9795 int n_res, j;
9796 if (lower)
9797 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9798 else
9799 n_res = _PyUnicode_ToUpperFull(c, mapped);
9800 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009801 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802 res[k++] = mapped[j];
9803 }
9804 }
9805 return k;
9806}
9807
9808static Py_ssize_t
9809do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9810{
9811 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9812}
9813
9814static Py_ssize_t
9815do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9816{
9817 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9818}
9819
Benjamin Petersone51757f2012-01-12 21:10:29 -05009820static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009821do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9822{
9823 Py_ssize_t i, k = 0;
9824
9825 for (i = 0; i < length; i++) {
9826 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9827 Py_UCS4 mapped[3];
9828 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9829 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009830 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009831 res[k++] = mapped[j];
9832 }
9833 }
9834 return k;
9835}
9836
9837static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009838do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9839{
9840 Py_ssize_t i, k = 0;
9841 int previous_is_cased;
9842
9843 previous_is_cased = 0;
9844 for (i = 0; i < length; i++) {
9845 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9846 Py_UCS4 mapped[3];
9847 int n_res, j;
9848
9849 if (previous_is_cased)
9850 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9851 else
9852 n_res = _PyUnicode_ToTitleFull(c, mapped);
9853
9854 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009855 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009856 res[k++] = mapped[j];
9857 }
9858
9859 previous_is_cased = _PyUnicode_IsCased(c);
9860 }
9861 return k;
9862}
9863
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864static PyObject *
9865case_operation(PyObject *self,
9866 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9867{
9868 PyObject *res = NULL;
9869 Py_ssize_t length, newlength = 0;
9870 int kind, outkind;
9871 void *data, *outdata;
9872 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9873
Benjamin Petersoneea48462012-01-16 14:28:50 -05009874 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009875
9876 kind = PyUnicode_KIND(self);
9877 data = PyUnicode_DATA(self);
9878 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009879 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009880 PyErr_SetString(PyExc_OverflowError, "string is too long");
9881 return NULL;
9882 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009883 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009884 if (tmp == NULL)
9885 return PyErr_NoMemory();
9886 newlength = perform(kind, data, length, tmp, &maxchar);
9887 res = PyUnicode_New(newlength, maxchar);
9888 if (res == NULL)
9889 goto leave;
9890 tmpend = tmp + newlength;
9891 outdata = PyUnicode_DATA(res);
9892 outkind = PyUnicode_KIND(res);
9893 switch (outkind) {
9894 case PyUnicode_1BYTE_KIND:
9895 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9896 break;
9897 case PyUnicode_2BYTE_KIND:
9898 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9899 break;
9900 case PyUnicode_4BYTE_KIND:
9901 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9902 break;
9903 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009904 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009905 }
9906 leave:
9907 PyMem_FREE(tmp);
9908 return res;
9909}
9910
Tim Peters8ce9f162004-08-27 01:49:32 +00009911PyObject *
9912PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009914 PyObject *res;
9915 PyObject *fseq;
9916 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009917 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009918
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009919 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009920 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009921 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009922 }
9923
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009924 /* NOTE: the following code can't call back into Python code,
9925 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009926 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009927
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009928 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009929 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009930 res = _PyUnicode_JoinArray(separator, items, seqlen);
9931 Py_DECREF(fseq);
9932 return res;
9933}
9934
9935PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009936_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009937{
9938 PyObject *res = NULL; /* the result */
9939 PyObject *sep = NULL;
9940 Py_ssize_t seplen;
9941 PyObject *item;
9942 Py_ssize_t sz, i, res_offset;
9943 Py_UCS4 maxchar;
9944 Py_UCS4 item_maxchar;
9945 int use_memcpy;
9946 unsigned char *res_data = NULL, *sep_data = NULL;
9947 PyObject *last_obj;
9948 unsigned int kind = 0;
9949
Tim Peters05eba1f2004-08-27 21:32:02 +00009950 /* If empty sequence, return u"". */
9951 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009952 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009953 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009954
Tim Peters05eba1f2004-08-27 21:32:02 +00009955 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009956 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009957 if (seqlen == 1) {
9958 if (PyUnicode_CheckExact(items[0])) {
9959 res = items[0];
9960 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009961 return res;
9962 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009963 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009964 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009965 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009966 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009967 /* Set up sep and seplen */
9968 if (separator == NULL) {
9969 /* fall back to a blank space separator */
9970 sep = PyUnicode_FromOrdinal(' ');
9971 if (!sep)
9972 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009973 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009974 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009975 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009976 else {
9977 if (!PyUnicode_Check(separator)) {
9978 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009979 "separator: expected str instance,"
9980 " %.80s found",
9981 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009982 goto onError;
9983 }
9984 if (PyUnicode_READY(separator))
9985 goto onError;
9986 sep = separator;
9987 seplen = PyUnicode_GET_LENGTH(separator);
9988 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9989 /* inc refcount to keep this code path symmetric with the
9990 above case of a blank separator */
9991 Py_INCREF(sep);
9992 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009993 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009994 }
9995
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009996 /* There are at least two things to join, or else we have a subclass
9997 * of str in the sequence.
9998 * Do a pre-pass to figure out the total amount of space we'll
9999 * need (sz), and see whether all argument are strings.
10000 */
10001 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010002#ifdef Py_DEBUG
10003 use_memcpy = 0;
10004#else
10005 use_memcpy = 1;
10006#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010007 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010008 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010009 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010010 if (!PyUnicode_Check(item)) {
10011 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010012 "sequence item %zd: expected str instance,"
10013 " %.80s found",
10014 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010015 goto onError;
10016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010017 if (PyUnicode_READY(item) == -1)
10018 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010019 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010020 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010021 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010022 if (i != 0) {
10023 add_sz += seplen;
10024 }
10025 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010026 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010027 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010028 goto onError;
10029 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010030 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010031 if (use_memcpy && last_obj != NULL) {
10032 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10033 use_memcpy = 0;
10034 }
10035 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010036 }
Tim Petersced69f82003-09-16 20:30:58 +000010037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010038 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010039 if (res == NULL)
10040 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010041
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010042 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010043#ifdef Py_DEBUG
10044 use_memcpy = 0;
10045#else
10046 if (use_memcpy) {
10047 res_data = PyUnicode_1BYTE_DATA(res);
10048 kind = PyUnicode_KIND(res);
10049 if (seplen != 0)
10050 sep_data = PyUnicode_1BYTE_DATA(sep);
10051 }
10052#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010053 if (use_memcpy) {
10054 for (i = 0; i < seqlen; ++i) {
10055 Py_ssize_t itemlen;
10056 item = items[i];
10057
10058 /* Copy item, and maybe the separator. */
10059 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010060 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010061 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010062 kind * seplen);
10063 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010064 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010065
10066 itemlen = PyUnicode_GET_LENGTH(item);
10067 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010068 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 kind * itemlen);
10071 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010073 }
10074 assert(res_data == PyUnicode_1BYTE_DATA(res)
10075 + kind * PyUnicode_GET_LENGTH(res));
10076 }
10077 else {
10078 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10079 Py_ssize_t itemlen;
10080 item = items[i];
10081
10082 /* Copy item, and maybe the separator. */
10083 if (i && seplen != 0) {
10084 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10085 res_offset += seplen;
10086 }
10087
10088 itemlen = PyUnicode_GET_LENGTH(item);
10089 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010090 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010091 res_offset += itemlen;
10092 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010093 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010094 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010095 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010096
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010097 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010098 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010099 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010100
Benjamin Peterson29060642009-01-31 22:14:21 +000010101 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010102 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010103 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104 return NULL;
10105}
10106
Victor Stinnerd3f08822012-05-29 12:57:52 +020010107void
10108_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10109 Py_UCS4 fill_char)
10110{
10111 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010112 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010113 assert(PyUnicode_IS_READY(unicode));
10114 assert(unicode_modifiable(unicode));
10115 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10116 assert(start >= 0);
10117 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010118 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119}
10120
Victor Stinner3fe55312012-01-04 00:33:50 +010010121Py_ssize_t
10122PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123 Py_UCS4 fill_char)
10124{
10125 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010126
10127 if (!PyUnicode_Check(unicode)) {
10128 PyErr_BadInternalCall();
10129 return -1;
10130 }
10131 if (PyUnicode_READY(unicode) == -1)
10132 return -1;
10133 if (unicode_check_modifiable(unicode))
10134 return -1;
10135
Victor Stinnerd3f08822012-05-29 12:57:52 +020010136 if (start < 0) {
10137 PyErr_SetString(PyExc_IndexError, "string index out of range");
10138 return -1;
10139 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010140 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10141 PyErr_SetString(PyExc_ValueError,
10142 "fill character is bigger than "
10143 "the string maximum character");
10144 return -1;
10145 }
10146
10147 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10148 length = Py_MIN(maxlen, length);
10149 if (length <= 0)
10150 return 0;
10151
Victor Stinnerd3f08822012-05-29 12:57:52 +020010152 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010153 return length;
10154}
10155
Victor Stinner9310abb2011-10-05 00:59:23 +020010156static PyObject *
10157pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010158 Py_ssize_t left,
10159 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010160 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010161{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010162 PyObject *u;
10163 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010164 int kind;
10165 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010166
10167 if (left < 0)
10168 left = 0;
10169 if (right < 0)
10170 right = 0;
10171
Victor Stinnerc4b49542011-12-11 22:44:26 +010010172 if (left == 0 && right == 0)
10173 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010175 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10176 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010177 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10178 return NULL;
10179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010180 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010181 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010182 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010183 if (!u)
10184 return NULL;
10185
10186 kind = PyUnicode_KIND(u);
10187 data = PyUnicode_DATA(u);
10188 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010189 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010190 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010191 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010192 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010193 assert(_PyUnicode_CheckConsistency(u, 1));
10194 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010195}
10196
Alexander Belopolsky40018472011-02-26 01:02:56 +000010197PyObject *
10198PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010200 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010202 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010203 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204
Benjamin Petersonead6b532011-12-20 17:23:42 -060010205 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010206 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010207 if (PyUnicode_IS_ASCII(string))
10208 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010209 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010210 PyUnicode_GET_LENGTH(string), keepends);
10211 else
10212 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010215 break;
10216 case PyUnicode_2BYTE_KIND:
10217 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010218 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 PyUnicode_GET_LENGTH(string), keepends);
10220 break;
10221 case PyUnicode_4BYTE_KIND:
10222 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010224 PyUnicode_GET_LENGTH(string), keepends);
10225 break;
10226 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010227 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010229 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010230}
10231
Alexander Belopolsky40018472011-02-26 01:02:56 +000010232static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010233split(PyObject *self,
10234 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010235 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010237 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 void *buf1, *buf2;
10239 Py_ssize_t len1, len2;
10240 PyObject* out;
10241
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010243 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010245 if (PyUnicode_READY(self) == -1)
10246 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010247
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010248 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010249 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010250 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010251 if (PyUnicode_IS_ASCII(self))
10252 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010253 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010254 PyUnicode_GET_LENGTH(self), maxcount
10255 );
10256 else
10257 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010258 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010259 PyUnicode_GET_LENGTH(self), maxcount
10260 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010261 case PyUnicode_2BYTE_KIND:
10262 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010263 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 PyUnicode_GET_LENGTH(self), maxcount
10265 );
10266 case PyUnicode_4BYTE_KIND:
10267 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010268 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 PyUnicode_GET_LENGTH(self), maxcount
10270 );
10271 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010272 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 }
10274
10275 if (PyUnicode_READY(substring) == -1)
10276 return NULL;
10277
10278 kind1 = PyUnicode_KIND(self);
10279 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010280 len1 = PyUnicode_GET_LENGTH(self);
10281 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010282 if (kind1 < kind2 || len1 < len2) {
10283 out = PyList_New(1);
10284 if (out == NULL)
10285 return NULL;
10286 Py_INCREF(self);
10287 PyList_SET_ITEM(out, 0, self);
10288 return out;
10289 }
10290 buf1 = PyUnicode_DATA(self);
10291 buf2 = PyUnicode_DATA(substring);
10292 if (kind2 != kind1) {
10293 buf2 = _PyUnicode_AsKind(substring, kind1);
10294 if (!buf2)
10295 return NULL;
10296 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010297
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010298 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010299 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010300 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10301 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010302 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010303 else
10304 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010305 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010306 break;
10307 case PyUnicode_2BYTE_KIND:
10308 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 case PyUnicode_4BYTE_KIND:
10312 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 break;
10315 default:
10316 out = NULL;
10317 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010318 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010319 PyMem_Free(buf2);
10320 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010321}
10322
Alexander Belopolsky40018472011-02-26 01:02:56 +000010323static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010324rsplit(PyObject *self,
10325 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010326 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010327{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010328 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 void *buf1, *buf2;
10330 Py_ssize_t len1, len2;
10331 PyObject* out;
10332
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010333 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010334 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010335
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010336 if (PyUnicode_READY(self) == -1)
10337 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010338
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010339 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010340 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010341 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010342 if (PyUnicode_IS_ASCII(self))
10343 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010344 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010345 PyUnicode_GET_LENGTH(self), maxcount
10346 );
10347 else
10348 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010349 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 PyUnicode_GET_LENGTH(self), maxcount
10351 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010352 case PyUnicode_2BYTE_KIND:
10353 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010354 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 PyUnicode_GET_LENGTH(self), maxcount
10356 );
10357 case PyUnicode_4BYTE_KIND:
10358 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010359 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 PyUnicode_GET_LENGTH(self), maxcount
10361 );
10362 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010363 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 }
10365
10366 if (PyUnicode_READY(substring) == -1)
10367 return NULL;
10368
10369 kind1 = PyUnicode_KIND(self);
10370 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 len1 = PyUnicode_GET_LENGTH(self);
10372 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010373 if (kind1 < kind2 || len1 < len2) {
10374 out = PyList_New(1);
10375 if (out == NULL)
10376 return NULL;
10377 Py_INCREF(self);
10378 PyList_SET_ITEM(out, 0, self);
10379 return out;
10380 }
10381 buf1 = PyUnicode_DATA(self);
10382 buf2 = PyUnicode_DATA(substring);
10383 if (kind2 != kind1) {
10384 buf2 = _PyUnicode_AsKind(substring, kind1);
10385 if (!buf2)
10386 return NULL;
10387 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010388
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010389 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010390 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010391 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10392 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010393 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010394 else
10395 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010396 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010397 break;
10398 case PyUnicode_2BYTE_KIND:
10399 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 break;
10402 case PyUnicode_4BYTE_KIND:
10403 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 default:
10407 out = NULL;
10408 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010409 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010410 PyMem_Free(buf2);
10411 return out;
10412}
10413
10414static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010415anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10416 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010417{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010418 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010420 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10421 return asciilib_find(buf1, len1, buf2, len2, offset);
10422 else
10423 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 case PyUnicode_2BYTE_KIND:
10425 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10426 case PyUnicode_4BYTE_KIND:
10427 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10428 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010429 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430}
10431
10432static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010433anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10434 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010435{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010436 switch (kind) {
10437 case PyUnicode_1BYTE_KIND:
10438 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10439 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10440 else
10441 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10442 case PyUnicode_2BYTE_KIND:
10443 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10444 case PyUnicode_4BYTE_KIND:
10445 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10446 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010447 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010448}
10449
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010450static void
10451replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10452 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10453{
10454 int kind = PyUnicode_KIND(u);
10455 void *data = PyUnicode_DATA(u);
10456 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10457 if (kind == PyUnicode_1BYTE_KIND) {
10458 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10459 (Py_UCS1 *)data + len,
10460 u1, u2, maxcount);
10461 }
10462 else if (kind == PyUnicode_2BYTE_KIND) {
10463 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10464 (Py_UCS2 *)data + len,
10465 u1, u2, maxcount);
10466 }
10467 else {
10468 assert(kind == PyUnicode_4BYTE_KIND);
10469 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10470 (Py_UCS4 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473}
10474
Alexander Belopolsky40018472011-02-26 01:02:56 +000010475static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010476replace(PyObject *self, PyObject *str1,
10477 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010478{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010479 PyObject *u;
10480 char *sbuf = PyUnicode_DATA(self);
10481 char *buf1 = PyUnicode_DATA(str1);
10482 char *buf2 = PyUnicode_DATA(str2);
10483 int srelease = 0, release1 = 0, release2 = 0;
10484 int skind = PyUnicode_KIND(self);
10485 int kind1 = PyUnicode_KIND(str1);
10486 int kind2 = PyUnicode_KIND(str2);
10487 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10488 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10489 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010490 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010491 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492
10493 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010494 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010495 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010496 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010497
Victor Stinner59de0ee2011-10-07 10:01:28 +020010498 if (str1 == str2)
10499 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010500
Victor Stinner49a0a212011-10-12 23:46:10 +020010501 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010502 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10503 if (maxchar < maxchar_str1)
10504 /* substring too wide to be present */
10505 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010506 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10507 /* Replacing str1 with str2 may cause a maxchar reduction in the
10508 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010509 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010510 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010511
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010512 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010513 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010515 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010518 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010519 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010520
Victor Stinner69ed0f42013-04-09 21:48:24 +020010521 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010522 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010523 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010524 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010525 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010527 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010529
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10531 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010532 }
10533 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 int rkind = skind;
10535 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010536 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010537
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 if (kind1 < rkind) {
10539 /* widen substring */
10540 buf1 = _PyUnicode_AsKind(str1, rkind);
10541 if (!buf1) goto error;
10542 release1 = 1;
10543 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010544 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010545 if (i < 0)
10546 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 if (rkind > kind2) {
10548 /* widen replacement */
10549 buf2 = _PyUnicode_AsKind(str2, rkind);
10550 if (!buf2) goto error;
10551 release2 = 1;
10552 }
10553 else if (rkind < kind2) {
10554 /* widen self and buf1 */
10555 rkind = kind2;
10556 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010557 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558 sbuf = _PyUnicode_AsKind(self, rkind);
10559 if (!sbuf) goto error;
10560 srelease = 1;
10561 buf1 = _PyUnicode_AsKind(str1, rkind);
10562 if (!buf1) goto error;
10563 release1 = 1;
10564 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010565 u = PyUnicode_New(slen, maxchar);
10566 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010567 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010568 assert(PyUnicode_KIND(u) == rkind);
10569 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010570
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010571 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010572 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010573 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010574 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010575 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010576 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010577
10578 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010579 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010580 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010581 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010582 if (i == -1)
10583 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010585 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010587 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010589 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010590 }
10591 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010592 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010593 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010594 int rkind = skind;
10595 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010596
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010597 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 buf1 = _PyUnicode_AsKind(str1, rkind);
10600 if (!buf1) goto error;
10601 release1 = 1;
10602 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010603 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010604 if (n == 0)
10605 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010607 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 buf2 = _PyUnicode_AsKind(str2, rkind);
10609 if (!buf2) goto error;
10610 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010611 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 rkind = kind2;
10615 sbuf = _PyUnicode_AsKind(self, rkind);
10616 if (!sbuf) goto error;
10617 srelease = 1;
10618 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010619 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 buf1 = _PyUnicode_AsKind(str1, rkind);
10621 if (!buf1) goto error;
10622 release1 = 1;
10623 }
10624 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10625 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010626 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010627 PyErr_SetString(PyExc_OverflowError,
10628 "replace string is too long");
10629 goto error;
10630 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010631 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010633 _Py_INCREF_UNICODE_EMPTY();
10634 if (!unicode_empty)
10635 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 u = unicode_empty;
10637 goto done;
10638 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010639 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 PyErr_SetString(PyExc_OverflowError,
10641 "replace string is too long");
10642 goto error;
10643 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010644 u = PyUnicode_New(new_size, maxchar);
10645 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010647 assert(PyUnicode_KIND(u) == rkind);
10648 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010649 ires = i = 0;
10650 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010651 while (n-- > 0) {
10652 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010653 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010654 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010655 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010656 if (j == -1)
10657 break;
10658 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 memcpy(res + rkind * ires,
10661 sbuf + rkind * i,
10662 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010664 }
10665 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010666 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010667 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010668 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010669 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010676 memcpy(res + rkind * ires,
10677 sbuf + rkind * i,
10678 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010679 }
10680 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 /* interleave */
10682 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010683 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010685 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 if (--n <= 0)
10688 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010689 memcpy(res + rkind * ires,
10690 sbuf + rkind * i,
10691 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 ires++;
10693 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010694 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010695 memcpy(res + rkind * ires,
10696 sbuf + rkind * i,
10697 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010699 }
10700
10701 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010702 unicode_adjust_maxchar(&u);
10703 if (u == NULL)
10704 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010705 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010706
10707 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010708 if (srelease)
10709 PyMem_FREE(sbuf);
10710 if (release1)
10711 PyMem_FREE(buf1);
10712 if (release2)
10713 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010714 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010715 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010716
Benjamin Peterson29060642009-01-31 22:14:21 +000010717 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010718 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 if (srelease)
10720 PyMem_FREE(sbuf);
10721 if (release1)
10722 PyMem_FREE(buf1);
10723 if (release2)
10724 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010725 return unicode_result_unchanged(self);
10726
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 error:
10728 if (srelease && sbuf)
10729 PyMem_FREE(sbuf);
10730 if (release1 && buf1)
10731 PyMem_FREE(buf1);
10732 if (release2 && buf2)
10733 PyMem_FREE(buf2);
10734 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010735}
10736
10737/* --- Unicode Object Methods --------------------------------------------- */
10738
INADA Naoki3ae20562017-01-16 20:41:20 +090010739/*[clinic input]
10740str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741
INADA Naoki3ae20562017-01-16 20:41:20 +090010742Return a version of the string where each word is titlecased.
10743
10744More specifically, words start with uppercased characters and all remaining
10745cased characters have lower case.
10746[clinic start generated code]*/
10747
10748static PyObject *
10749unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010750/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010751{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010752 if (PyUnicode_READY(self) == -1)
10753 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010754 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755}
10756
INADA Naoki3ae20562017-01-16 20:41:20 +090010757/*[clinic input]
10758str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759
INADA Naoki3ae20562017-01-16 20:41:20 +090010760Return a capitalized version of the string.
10761
10762More specifically, make the first character have upper case and the rest lower
10763case.
10764[clinic start generated code]*/
10765
10766static PyObject *
10767unicode_capitalize_impl(PyObject *self)
10768/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010770 if (PyUnicode_READY(self) == -1)
10771 return NULL;
10772 if (PyUnicode_GET_LENGTH(self) == 0)
10773 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010774 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775}
10776
INADA Naoki3ae20562017-01-16 20:41:20 +090010777/*[clinic input]
10778str.casefold as unicode_casefold
10779
10780Return a version of the string suitable for caseless comparisons.
10781[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010782
10783static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010784unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010785/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010786{
10787 if (PyUnicode_READY(self) == -1)
10788 return NULL;
10789 if (PyUnicode_IS_ASCII(self))
10790 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010791 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010792}
10793
10794
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010795/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010796
10797static int
10798convert_uc(PyObject *obj, void *addr)
10799{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010801
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010802 if (!PyUnicode_Check(obj)) {
10803 PyErr_Format(PyExc_TypeError,
10804 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010805 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010806 return 0;
10807 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010808 if (PyUnicode_READY(obj) < 0)
10809 return 0;
10810 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010811 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010812 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010813 return 0;
10814 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010815 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010816 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010817}
10818
INADA Naoki3ae20562017-01-16 20:41:20 +090010819/*[clinic input]
10820str.center as unicode_center
10821
10822 width: Py_ssize_t
10823 fillchar: Py_UCS4 = ' '
10824 /
10825
10826Return a centered string of length width.
10827
10828Padding is done using the specified fill character (default is a space).
10829[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010830
10831static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010832unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10833/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010835 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
Benjamin Petersonbac79492012-01-14 13:34:47 -050010837 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838 return NULL;
10839
Victor Stinnerc4b49542011-12-11 22:44:26 +010010840 if (PyUnicode_GET_LENGTH(self) >= width)
10841 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Victor Stinnerc4b49542011-12-11 22:44:26 +010010843 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 left = marg / 2 + (marg & width & 1);
10845
Victor Stinner9310abb2011-10-05 00:59:23 +020010846 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010847}
10848
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010849/* This function assumes that str1 and str2 are readied by the caller. */
10850
Marc-André Lemburge5034372000-08-08 08:04:29 +000010851static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010852unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010853{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010854#define COMPARE(TYPE1, TYPE2) \
10855 do { \
10856 TYPE1* p1 = (TYPE1 *)data1; \
10857 TYPE2* p2 = (TYPE2 *)data2; \
10858 TYPE1* end = p1 + len; \
10859 Py_UCS4 c1, c2; \
10860 for (; p1 != end; p1++, p2++) { \
10861 c1 = *p1; \
10862 c2 = *p2; \
10863 if (c1 != c2) \
10864 return (c1 < c2) ? -1 : 1; \
10865 } \
10866 } \
10867 while (0)
10868
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010869 int kind1, kind2;
10870 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010871 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 kind1 = PyUnicode_KIND(str1);
10874 kind2 = PyUnicode_KIND(str2);
10875 data1 = PyUnicode_DATA(str1);
10876 data2 = PyUnicode_DATA(str2);
10877 len1 = PyUnicode_GET_LENGTH(str1);
10878 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010879 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010880
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010881 switch(kind1) {
10882 case PyUnicode_1BYTE_KIND:
10883 {
10884 switch(kind2) {
10885 case PyUnicode_1BYTE_KIND:
10886 {
10887 int cmp = memcmp(data1, data2, len);
10888 /* normalize result of memcmp() into the range [-1; 1] */
10889 if (cmp < 0)
10890 return -1;
10891 if (cmp > 0)
10892 return 1;
10893 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010894 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010895 case PyUnicode_2BYTE_KIND:
10896 COMPARE(Py_UCS1, Py_UCS2);
10897 break;
10898 case PyUnicode_4BYTE_KIND:
10899 COMPARE(Py_UCS1, Py_UCS4);
10900 break;
10901 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010902 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010903 }
10904 break;
10905 }
10906 case PyUnicode_2BYTE_KIND:
10907 {
10908 switch(kind2) {
10909 case PyUnicode_1BYTE_KIND:
10910 COMPARE(Py_UCS2, Py_UCS1);
10911 break;
10912 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010913 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010914 COMPARE(Py_UCS2, Py_UCS2);
10915 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010916 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010917 case PyUnicode_4BYTE_KIND:
10918 COMPARE(Py_UCS2, Py_UCS4);
10919 break;
10920 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010921 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 }
10923 break;
10924 }
10925 case PyUnicode_4BYTE_KIND:
10926 {
10927 switch(kind2) {
10928 case PyUnicode_1BYTE_KIND:
10929 COMPARE(Py_UCS4, Py_UCS1);
10930 break;
10931 case PyUnicode_2BYTE_KIND:
10932 COMPARE(Py_UCS4, Py_UCS2);
10933 break;
10934 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010935 {
10936#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10937 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10938 /* normalize result of wmemcmp() into the range [-1; 1] */
10939 if (cmp < 0)
10940 return -1;
10941 if (cmp > 0)
10942 return 1;
10943#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010944 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010945#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010946 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010947 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010948 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010949 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950 }
10951 break;
10952 }
10953 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010954 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010955 }
10956
Victor Stinner770e19e2012-10-04 22:59:45 +020010957 if (len1 == len2)
10958 return 0;
10959 if (len1 < len2)
10960 return -1;
10961 else
10962 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010963
10964#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010965}
10966
Benjamin Peterson621b4302016-09-09 13:54:34 -070010967static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010968unicode_compare_eq(PyObject *str1, PyObject *str2)
10969{
10970 int kind;
10971 void *data1, *data2;
10972 Py_ssize_t len;
10973 int cmp;
10974
Victor Stinnere5567ad2012-10-23 02:48:49 +020010975 len = PyUnicode_GET_LENGTH(str1);
10976 if (PyUnicode_GET_LENGTH(str2) != len)
10977 return 0;
10978 kind = PyUnicode_KIND(str1);
10979 if (PyUnicode_KIND(str2) != kind)
10980 return 0;
10981 data1 = PyUnicode_DATA(str1);
10982 data2 = PyUnicode_DATA(str2);
10983
10984 cmp = memcmp(data1, data2, len * kind);
10985 return (cmp == 0);
10986}
10987
10988
Alexander Belopolsky40018472011-02-26 01:02:56 +000010989int
10990PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010991{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010992 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10993 if (PyUnicode_READY(left) == -1 ||
10994 PyUnicode_READY(right) == -1)
10995 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010010996
10997 /* a string is equal to itself */
10998 if (left == right)
10999 return 0;
11000
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011001 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011002 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011003 PyErr_Format(PyExc_TypeError,
11004 "Can't compare %.100s and %.100s",
11005 left->ob_type->tp_name,
11006 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011007 return -1;
11008}
11009
Martin v. Löwis5b222132007-06-10 09:51:05 +000011010int
11011PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011013 Py_ssize_t i;
11014 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011015 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011016 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017
Victor Stinner910337b2011-10-03 03:20:16 +020011018 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011019 if (!PyUnicode_IS_READY(uni)) {
11020 const wchar_t *ws = _PyUnicode_WSTR(uni);
11021 /* Compare Unicode string and source character set string */
11022 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11023 if (chr != ustr[i])
11024 return (chr < ustr[i]) ? -1 : 1;
11025 }
11026 /* This check keeps Python strings that end in '\0' from comparing equal
11027 to C strings identical up to that point. */
11028 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11029 return 1; /* uni is longer */
11030 if (ustr[i])
11031 return -1; /* str is longer */
11032 return 0;
11033 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011034 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011035 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011036 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011037 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011038 size_t len, len2 = strlen(str);
11039 int cmp;
11040
11041 len = Py_MIN(len1, len2);
11042 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011043 if (cmp != 0) {
11044 if (cmp < 0)
11045 return -1;
11046 else
11047 return 1;
11048 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011049 if (len1 > len2)
11050 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011051 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011052 return -1; /* str is longer */
11053 return 0;
11054 }
11055 else {
11056 void *data = PyUnicode_DATA(uni);
11057 /* Compare Unicode string and source character set string */
11058 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011059 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011060 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11061 /* This check keeps Python strings that end in '\0' from comparing equal
11062 to C strings identical up to that point. */
11063 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11064 return 1; /* uni is longer */
11065 if (str[i])
11066 return -1; /* str is longer */
11067 return 0;
11068 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011069}
11070
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011071static int
11072non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11073{
11074 size_t i, len;
11075 const wchar_t *p;
11076 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11077 if (strlen(str) != len)
11078 return 0;
11079 p = _PyUnicode_WSTR(unicode);
11080 assert(p);
11081 for (i = 0; i < len; i++) {
11082 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011083 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011084 return 0;
11085 }
11086 return 1;
11087}
11088
11089int
11090_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11091{
11092 size_t len;
11093 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011094 assert(str);
11095#ifndef NDEBUG
11096 for (const char *p = str; *p; p++) {
11097 assert((unsigned char)*p < 128);
11098 }
11099#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011100 if (PyUnicode_READY(unicode) == -1) {
11101 /* Memory error or bad data */
11102 PyErr_Clear();
11103 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11104 }
11105 if (!PyUnicode_IS_ASCII(unicode))
11106 return 0;
11107 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11108 return strlen(str) == len &&
11109 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11110}
11111
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011112int
11113_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11114{
11115 PyObject *right_uni;
11116 Py_hash_t hash;
11117
11118 assert(_PyUnicode_CHECK(left));
11119 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011120#ifndef NDEBUG
11121 for (const char *p = right->string; *p; p++) {
11122 assert((unsigned char)*p < 128);
11123 }
11124#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011125
11126 if (PyUnicode_READY(left) == -1) {
11127 /* memory error or bad data */
11128 PyErr_Clear();
11129 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11130 }
11131
11132 if (!PyUnicode_IS_ASCII(left))
11133 return 0;
11134
11135 right_uni = _PyUnicode_FromId(right); /* borrowed */
11136 if (right_uni == NULL) {
11137 /* memory error or bad data */
11138 PyErr_Clear();
11139 return _PyUnicode_EqualToASCIIString(left, right->string);
11140 }
11141
11142 if (left == right_uni)
11143 return 1;
11144
11145 if (PyUnicode_CHECK_INTERNED(left))
11146 return 0;
11147
INADA Naoki7cc95f52018-01-28 02:07:09 +090011148 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011149 hash = _PyUnicode_HASH(left);
11150 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11151 return 0;
11152
11153 return unicode_compare_eq(left, right_uni);
11154}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011155
Alexander Belopolsky40018472011-02-26 01:02:56 +000011156PyObject *
11157PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011158{
11159 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011160
Victor Stinnere5567ad2012-10-23 02:48:49 +020011161 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11162 Py_RETURN_NOTIMPLEMENTED;
11163
11164 if (PyUnicode_READY(left) == -1 ||
11165 PyUnicode_READY(right) == -1)
11166 return NULL;
11167
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011168 if (left == right) {
11169 switch (op) {
11170 case Py_EQ:
11171 case Py_LE:
11172 case Py_GE:
11173 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011174 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011175 case Py_NE:
11176 case Py_LT:
11177 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011178 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011179 default:
11180 PyErr_BadArgument();
11181 return NULL;
11182 }
11183 }
11184 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011185 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011186 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011187 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011188 }
11189 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011190 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011191 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011192 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011193}
11194
Alexander Belopolsky40018472011-02-26 01:02:56 +000011195int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011196_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11197{
11198 return unicode_eq(aa, bb);
11199}
11200
11201int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011202PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011203{
Victor Stinner77282cb2013-04-14 19:22:47 +020011204 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011205 void *buf1, *buf2;
11206 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011207 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011208
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011209 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011210 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011211 "'in <string>' requires string as left operand, not %.100s",
11212 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011213 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011216 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217 if (ensure_unicode(str) < 0)
11218 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011220 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 kind2 = PyUnicode_KIND(substr);
11222 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011223 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 len2 = PyUnicode_GET_LENGTH(substr);
11226 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011227 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011228 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011230 if (len2 == 1) {
11231 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11232 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011233 return result;
11234 }
11235 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011236 buf2 = _PyUnicode_AsKind(substr, kind1);
11237 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011240
Victor Stinner77282cb2013-04-14 19:22:47 +020011241 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011242 case PyUnicode_1BYTE_KIND:
11243 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11244 break;
11245 case PyUnicode_2BYTE_KIND:
11246 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11247 break;
11248 case PyUnicode_4BYTE_KIND:
11249 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11250 break;
11251 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011252 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011253 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011254
Victor Stinner77282cb2013-04-14 19:22:47 +020011255 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 PyMem_Free(buf2);
11257
Guido van Rossum403d68b2000-03-13 15:55:09 +000011258 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011259}
11260
Guido van Rossumd57fd912000-03-10 22:53:23 +000011261/* Concat to string or Unicode object giving a new Unicode object. */
11262
Alexander Belopolsky40018472011-02-26 01:02:56 +000011263PyObject *
11264PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011266 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011267 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011268 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011270 if (ensure_unicode(left) < 0)
11271 return NULL;
11272
11273 if (!PyUnicode_Check(right)) {
11274 PyErr_Format(PyExc_TypeError,
11275 "can only concatenate str (not \"%.200s\") to str",
11276 right->ob_type->tp_name);
11277 return NULL;
11278 }
11279 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011280 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011281
11282 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011283 if (left == unicode_empty)
11284 return PyUnicode_FromObject(right);
11285 if (right == unicode_empty)
11286 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011288 left_len = PyUnicode_GET_LENGTH(left);
11289 right_len = PyUnicode_GET_LENGTH(right);
11290 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011291 PyErr_SetString(PyExc_OverflowError,
11292 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011293 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011294 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011295 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011296
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11298 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011299 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011300
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 result = PyUnicode_New(new_len, maxchar);
11303 if (result == NULL)
11304 return NULL;
11305 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11306 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11307 assert(_PyUnicode_CheckConsistency(result, 1));
11308 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309}
11310
Walter Dörwald1ab83302007-05-18 17:15:44 +000011311void
Victor Stinner23e56682011-10-03 03:54:37 +020011312PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011313{
Victor Stinner23e56682011-10-03 03:54:37 +020011314 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011315 Py_UCS4 maxchar, maxchar2;
11316 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011317
11318 if (p_left == NULL) {
11319 if (!PyErr_Occurred())
11320 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011321 return;
11322 }
Victor Stinner23e56682011-10-03 03:54:37 +020011323 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011324 if (right == NULL || left == NULL
11325 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011326 if (!PyErr_Occurred())
11327 PyErr_BadInternalCall();
11328 goto error;
11329 }
11330
Benjamin Petersonbac79492012-01-14 13:34:47 -050011331 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011332 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011333 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011334 goto error;
11335
Victor Stinner488fa492011-12-12 00:01:39 +010011336 /* Shortcuts */
11337 if (left == unicode_empty) {
11338 Py_DECREF(left);
11339 Py_INCREF(right);
11340 *p_left = right;
11341 return;
11342 }
11343 if (right == unicode_empty)
11344 return;
11345
11346 left_len = PyUnicode_GET_LENGTH(left);
11347 right_len = PyUnicode_GET_LENGTH(right);
11348 if (left_len > PY_SSIZE_T_MAX - right_len) {
11349 PyErr_SetString(PyExc_OverflowError,
11350 "strings are too large to concat");
11351 goto error;
11352 }
11353 new_len = left_len + right_len;
11354
11355 if (unicode_modifiable(left)
11356 && PyUnicode_CheckExact(right)
11357 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011358 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11359 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011360 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011361 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011362 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11363 {
11364 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011365 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011366 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011367
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011368 /* copy 'right' into the newly allocated area of 'left' */
11369 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011370 }
Victor Stinner488fa492011-12-12 00:01:39 +010011371 else {
11372 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11373 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011374 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011375
Victor Stinner488fa492011-12-12 00:01:39 +010011376 /* Concat the two Unicode strings */
11377 res = PyUnicode_New(new_len, maxchar);
11378 if (res == NULL)
11379 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011380 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11381 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011382 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011383 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011384 }
11385 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011386 return;
11387
11388error:
Victor Stinner488fa492011-12-12 00:01:39 +010011389 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011390}
11391
11392void
11393PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11394{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011395 PyUnicode_Append(pleft, right);
11396 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011397}
11398
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011399/*
11400Wraps stringlib_parse_args_finds() and additionally ensures that the
11401first argument is a unicode object.
11402*/
11403
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011404static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011405parse_args_finds_unicode(const char * function_name, PyObject *args,
11406 PyObject **substring,
11407 Py_ssize_t *start, Py_ssize_t *end)
11408{
11409 if(stringlib_parse_args_finds(function_name, args, substring,
11410 start, end)) {
11411 if (ensure_unicode(*substring) < 0)
11412 return 0;
11413 return 1;
11414 }
11415 return 0;
11416}
11417
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011418PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011419 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011420\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011421Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011422string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011423interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424
11425static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011426unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011427{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011428 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011429 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011430 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011432 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011433 void *buf1, *buf2;
11434 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011436 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011437 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011438
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 kind1 = PyUnicode_KIND(self);
11440 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011441 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011442 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011443
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011444 len1 = PyUnicode_GET_LENGTH(self);
11445 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011446 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011448 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011450 buf1 = PyUnicode_DATA(self);
11451 buf2 = PyUnicode_DATA(substring);
11452 if (kind2 != kind1) {
11453 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011454 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011455 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 }
11457 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 case PyUnicode_1BYTE_KIND:
11459 iresult = ucs1lib_count(
11460 ((Py_UCS1*)buf1) + start, end - start,
11461 buf2, len2, PY_SSIZE_T_MAX
11462 );
11463 break;
11464 case PyUnicode_2BYTE_KIND:
11465 iresult = ucs2lib_count(
11466 ((Py_UCS2*)buf1) + start, end - start,
11467 buf2, len2, PY_SSIZE_T_MAX
11468 );
11469 break;
11470 case PyUnicode_4BYTE_KIND:
11471 iresult = ucs4lib_count(
11472 ((Py_UCS4*)buf1) + start, end - start,
11473 buf2, len2, PY_SSIZE_T_MAX
11474 );
11475 break;
11476 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011477 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011478 }
11479
11480 result = PyLong_FromSsize_t(iresult);
11481
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011482 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011483 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011484
Guido van Rossumd57fd912000-03-10 22:53:23 +000011485 return result;
11486}
11487
INADA Naoki3ae20562017-01-16 20:41:20 +090011488/*[clinic input]
11489str.encode as unicode_encode
11490
11491 encoding: str(c_default="NULL") = 'utf-8'
11492 The encoding in which to encode the string.
11493 errors: str(c_default="NULL") = 'strict'
11494 The error handling scheme to use for encoding errors.
11495 The default is 'strict' meaning that encoding errors raise a
11496 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11497 'xmlcharrefreplace' as well as any other name registered with
11498 codecs.register_error that can handle UnicodeEncodeErrors.
11499
11500Encode the string using the codec registered for encoding.
11501[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011502
11503static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011504unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011505/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011507 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011508}
11509
INADA Naoki3ae20562017-01-16 20:41:20 +090011510/*[clinic input]
11511str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512
INADA Naoki3ae20562017-01-16 20:41:20 +090011513 tabsize: int = 8
11514
11515Return a copy where all tab characters are expanded using spaces.
11516
11517If tabsize is not given, a tab size of 8 characters is assumed.
11518[clinic start generated code]*/
11519
11520static PyObject *
11521unicode_expandtabs_impl(PyObject *self, int tabsize)
11522/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011523{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011524 Py_ssize_t i, j, line_pos, src_len, incr;
11525 Py_UCS4 ch;
11526 PyObject *u;
11527 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011528 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011529 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011530
Antoine Pitrou22425222011-10-04 19:10:51 +020011531 if (PyUnicode_READY(self) == -1)
11532 return NULL;
11533
Thomas Wouters7e474022000-07-16 12:04:32 +000011534 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011535 src_len = PyUnicode_GET_LENGTH(self);
11536 i = j = line_pos = 0;
11537 kind = PyUnicode_KIND(self);
11538 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011539 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011540 for (; i < src_len; i++) {
11541 ch = PyUnicode_READ(kind, src_data, i);
11542 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011543 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011544 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011545 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011546 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011547 goto overflow;
11548 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011549 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011550 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011554 goto overflow;
11555 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011557 if (ch == '\n' || ch == '\r')
11558 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011561 if (!found)
11562 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011563
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 if (!u)
11567 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011569
Antoine Pitroue71d5742011-10-04 15:55:09 +020011570 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011571
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 for (; i < src_len; i++) {
11573 ch = PyUnicode_READ(kind, src_data, i);
11574 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011575 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 incr = tabsize - (line_pos % tabsize);
11577 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011578 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011579 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011581 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011582 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011583 line_pos++;
11584 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011585 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 if (ch == '\n' || ch == '\r')
11587 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011588 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 }
11590 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011591 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011592
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011594 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596}
11597
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011598PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011599 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600\n\
11601Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011602such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011603arguments start and end are interpreted as in slice notation.\n\
11604\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011605Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606
11607static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011608unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011610 /* initialize variables to prevent gcc warning */
11611 PyObject *substring = NULL;
11612 Py_ssize_t start = 0;
11613 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011614 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011616 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011618
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011619 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011620 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011621
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011622 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 if (result == -2)
11625 return NULL;
11626
Christian Heimes217cfd12007-12-02 14:31:20 +000011627 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011628}
11629
11630static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011631unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011633 void *data;
11634 enum PyUnicode_Kind kind;
11635 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011636
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011637 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011638 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011639 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011640 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011641 if (PyUnicode_READY(self) == -1) {
11642 return NULL;
11643 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11645 PyErr_SetString(PyExc_IndexError, "string index out of range");
11646 return NULL;
11647 }
11648 kind = PyUnicode_KIND(self);
11649 data = PyUnicode_DATA(self);
11650 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011651 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011652}
11653
Guido van Rossumc2504932007-09-18 19:42:40 +000011654/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011655 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011656static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011657unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011659 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011660
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011661#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011662 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011663#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011664 if (_PyUnicode_HASH(self) != -1)
11665 return _PyUnicode_HASH(self);
11666 if (PyUnicode_READY(self) == -1)
11667 return -1;
animalizea1d14252019-01-02 20:16:06 +080011668
Christian Heimes985ecdc2013-11-20 11:46:18 +010011669 x = _Py_HashBytes(PyUnicode_DATA(self),
11670 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011671 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011672 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011673}
11674
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011675PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677\n\
oldkaa0735f2018-02-02 16:52:55 +080011678Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011679such that sub is contained within S[start:end]. Optional\n\
11680arguments start and end are interpreted as in slice notation.\n\
11681\n\
11682Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683
11684static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011686{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011687 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011688 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011689 PyObject *substring = NULL;
11690 Py_ssize_t start = 0;
11691 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011693 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011695
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011696 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011697 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011698
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011699 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 if (result == -2)
11702 return NULL;
11703
Guido van Rossumd57fd912000-03-10 22:53:23 +000011704 if (result < 0) {
11705 PyErr_SetString(PyExc_ValueError, "substring not found");
11706 return NULL;
11707 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Christian Heimes217cfd12007-12-02 14:31:20 +000011709 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710}
11711
INADA Naoki3ae20562017-01-16 20:41:20 +090011712/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011713str.isascii as unicode_isascii
11714
11715Return True if all characters in the string are ASCII, False otherwise.
11716
11717ASCII characters have code points in the range U+0000-U+007F.
11718Empty string is ASCII too.
11719[clinic start generated code]*/
11720
11721static PyObject *
11722unicode_isascii_impl(PyObject *self)
11723/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11724{
11725 if (PyUnicode_READY(self) == -1) {
11726 return NULL;
11727 }
11728 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11729}
11730
11731/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011732str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011733
INADA Naoki3ae20562017-01-16 20:41:20 +090011734Return True if the string is a lowercase string, False otherwise.
11735
11736A string is lowercase if all cased characters in the string are lowercase and
11737there is at least one cased character in the string.
11738[clinic start generated code]*/
11739
11740static PyObject *
11741unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011742/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011743{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011744 Py_ssize_t i, length;
11745 int kind;
11746 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747 int cased;
11748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011749 if (PyUnicode_READY(self) == -1)
11750 return NULL;
11751 length = PyUnicode_GET_LENGTH(self);
11752 kind = PyUnicode_KIND(self);
11753 data = PyUnicode_DATA(self);
11754
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (length == 1)
11757 return PyBool_FromLong(
11758 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011760 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011761 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011762 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011763
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 for (i = 0; i < length; i++) {
11766 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011767
Benjamin Peterson29060642009-01-31 22:14:21 +000011768 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011769 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011770 else if (!cased && Py_UNICODE_ISLOWER(ch))
11771 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011773 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011774}
11775
INADA Naoki3ae20562017-01-16 20:41:20 +090011776/*[clinic input]
11777str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778
INADA Naoki3ae20562017-01-16 20:41:20 +090011779Return True if the string is an uppercase string, False otherwise.
11780
11781A string is uppercase if all cased characters in the string are uppercase and
11782there is at least one cased character in the string.
11783[clinic start generated code]*/
11784
11785static PyObject *
11786unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011787/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011789 Py_ssize_t i, length;
11790 int kind;
11791 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792 int cased;
11793
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011794 if (PyUnicode_READY(self) == -1)
11795 return NULL;
11796 length = PyUnicode_GET_LENGTH(self);
11797 kind = PyUnicode_KIND(self);
11798 data = PyUnicode_DATA(self);
11799
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011801 if (length == 1)
11802 return PyBool_FromLong(
11803 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011805 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011806 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011807 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011808
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 for (i = 0; i < length; i++) {
11811 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011812
Benjamin Peterson29060642009-01-31 22:14:21 +000011813 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011814 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011815 else if (!cased && Py_UNICODE_ISUPPER(ch))
11816 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011818 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011819}
11820
INADA Naoki3ae20562017-01-16 20:41:20 +090011821/*[clinic input]
11822str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823
INADA Naoki3ae20562017-01-16 20:41:20 +090011824Return True if the string is a title-cased string, False otherwise.
11825
11826In a title-cased string, upper- and title-case characters may only
11827follow uncased characters and lowercase characters only cased ones.
11828[clinic start generated code]*/
11829
11830static PyObject *
11831unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011832/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011834 Py_ssize_t i, length;
11835 int kind;
11836 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837 int cased, previous_is_cased;
11838
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011839 if (PyUnicode_READY(self) == -1)
11840 return NULL;
11841 length = PyUnicode_GET_LENGTH(self);
11842 kind = PyUnicode_KIND(self);
11843 data = PyUnicode_DATA(self);
11844
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011846 if (length == 1) {
11847 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11848 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11849 (Py_UNICODE_ISUPPER(ch) != 0));
11850 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011852 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011854 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011855
Guido van Rossumd57fd912000-03-10 22:53:23 +000011856 cased = 0;
11857 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011858 for (i = 0; i < length; i++) {
11859 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011860
Benjamin Peterson29060642009-01-31 22:14:21 +000011861 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11862 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011863 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011864 previous_is_cased = 1;
11865 cased = 1;
11866 }
11867 else if (Py_UNICODE_ISLOWER(ch)) {
11868 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011869 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 previous_is_cased = 1;
11871 cased = 1;
11872 }
11873 else
11874 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011876 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011877}
11878
INADA Naoki3ae20562017-01-16 20:41:20 +090011879/*[clinic input]
11880str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881
INADA Naoki3ae20562017-01-16 20:41:20 +090011882Return True if the string is a whitespace string, False otherwise.
11883
11884A string is whitespace if all characters in the string are whitespace and there
11885is at least one character in the string.
11886[clinic start generated code]*/
11887
11888static PyObject *
11889unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011890/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011892 Py_ssize_t i, length;
11893 int kind;
11894 void *data;
11895
11896 if (PyUnicode_READY(self) == -1)
11897 return NULL;
11898 length = PyUnicode_GET_LENGTH(self);
11899 kind = PyUnicode_KIND(self);
11900 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011901
Guido van Rossumd57fd912000-03-10 22:53:23 +000011902 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011903 if (length == 1)
11904 return PyBool_FromLong(
11905 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011907 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011908 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011909 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011910
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 for (i = 0; i < length; i++) {
11912 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011913 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011914 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011916 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011917}
11918
INADA Naoki3ae20562017-01-16 20:41:20 +090011919/*[clinic input]
11920str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011921
INADA Naoki3ae20562017-01-16 20:41:20 +090011922Return True if the string is an alphabetic string, False otherwise.
11923
11924A string is alphabetic if all characters in the string are alphabetic and there
11925is at least one character in the string.
11926[clinic start generated code]*/
11927
11928static PyObject *
11929unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011930/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011932 Py_ssize_t i, length;
11933 int kind;
11934 void *data;
11935
11936 if (PyUnicode_READY(self) == -1)
11937 return NULL;
11938 length = PyUnicode_GET_LENGTH(self);
11939 kind = PyUnicode_KIND(self);
11940 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011941
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011942 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011943 if (length == 1)
11944 return PyBool_FromLong(
11945 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946
11947 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011948 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011949 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011950
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 for (i = 0; i < length; i++) {
11952 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011953 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011954 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011955 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011956}
11957
INADA Naoki3ae20562017-01-16 20:41:20 +090011958/*[clinic input]
11959str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960
INADA Naoki3ae20562017-01-16 20:41:20 +090011961Return True if the string is an alpha-numeric string, False otherwise.
11962
11963A string is alpha-numeric if all characters in the string are alpha-numeric and
11964there is at least one character in the string.
11965[clinic start generated code]*/
11966
11967static PyObject *
11968unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011969/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 int kind;
11972 void *data;
11973 Py_ssize_t len, i;
11974
11975 if (PyUnicode_READY(self) == -1)
11976 return NULL;
11977
11978 kind = PyUnicode_KIND(self);
11979 data = PyUnicode_DATA(self);
11980 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011981
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011982 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011983 if (len == 1) {
11984 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11985 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11986 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987
11988 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011990 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011991
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011992 for (i = 0; i < len; i++) {
11993 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011994 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011995 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011996 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011997 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011998}
11999
INADA Naoki3ae20562017-01-16 20:41:20 +090012000/*[clinic input]
12001str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012002
INADA Naoki3ae20562017-01-16 20:41:20 +090012003Return True if the string is a decimal string, False otherwise.
12004
12005A string is a decimal string if all characters in the string are decimal and
12006there is at least one character in the string.
12007[clinic start generated code]*/
12008
12009static PyObject *
12010unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012011/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012012{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012013 Py_ssize_t i, length;
12014 int kind;
12015 void *data;
12016
12017 if (PyUnicode_READY(self) == -1)
12018 return NULL;
12019 length = PyUnicode_GET_LENGTH(self);
12020 kind = PyUnicode_KIND(self);
12021 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012022
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 if (length == 1)
12025 return PyBool_FromLong(
12026 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012028 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012029 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012030 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 for (i = 0; i < length; i++) {
12033 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012034 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012036 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037}
12038
INADA Naoki3ae20562017-01-16 20:41:20 +090012039/*[clinic input]
12040str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
INADA Naoki3ae20562017-01-16 20:41:20 +090012042Return True if the string is a digit string, False otherwise.
12043
12044A string is a digit string if all characters in the string are digits and there
12045is at least one character in the string.
12046[clinic start generated code]*/
12047
12048static PyObject *
12049unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012050/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012052 Py_ssize_t i, length;
12053 int kind;
12054 void *data;
12055
12056 if (PyUnicode_READY(self) == -1)
12057 return NULL;
12058 length = PyUnicode_GET_LENGTH(self);
12059 kind = PyUnicode_KIND(self);
12060 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012061
Guido van Rossumd57fd912000-03-10 22:53:23 +000012062 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012063 if (length == 1) {
12064 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12065 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12066 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012068 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012070 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012072 for (i = 0; i < length; i++) {
12073 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012074 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012076 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012077}
12078
INADA Naoki3ae20562017-01-16 20:41:20 +090012079/*[clinic input]
12080str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
INADA Naoki3ae20562017-01-16 20:41:20 +090012082Return True if the string is a numeric string, False otherwise.
12083
12084A string is numeric if all characters in the string are numeric and there is at
12085least one character in the string.
12086[clinic start generated code]*/
12087
12088static PyObject *
12089unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012090/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012092 Py_ssize_t i, length;
12093 int kind;
12094 void *data;
12095
12096 if (PyUnicode_READY(self) == -1)
12097 return NULL;
12098 length = PyUnicode_GET_LENGTH(self);
12099 kind = PyUnicode_KIND(self);
12100 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012101
Guido van Rossumd57fd912000-03-10 22:53:23 +000012102 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 if (length == 1)
12104 return PyBool_FromLong(
12105 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012107 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012108 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012109 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 for (i = 0; i < length; i++) {
12112 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012113 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116}
12117
Martin v. Löwis47383402007-08-15 07:32:56 +000012118int
12119PyUnicode_IsIdentifier(PyObject *self)
12120{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 int kind;
12122 void *data;
12123 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012124 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012126 if (PyUnicode_READY(self) == -1) {
12127 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012128 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 }
12130
12131 /* Special case for empty strings */
12132 if (PyUnicode_GET_LENGTH(self) == 0)
12133 return 0;
12134 kind = PyUnicode_KIND(self);
12135 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012136
12137 /* PEP 3131 says that the first character must be in
12138 XID_Start and subsequent characters in XID_Continue,
12139 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012140 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012141 letters, digits, underscore). However, given the current
12142 definition of XID_Start and XID_Continue, it is sufficient
12143 to check just for these, except that _ must be allowed
12144 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012146 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012147 return 0;
12148
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012149 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012150 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012151 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012152 return 1;
12153}
12154
INADA Naoki3ae20562017-01-16 20:41:20 +090012155/*[clinic input]
12156str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012157
INADA Naoki3ae20562017-01-16 20:41:20 +090012158Return True if the string is a valid Python identifier, False otherwise.
12159
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012160Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012161such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012162[clinic start generated code]*/
12163
12164static PyObject *
12165unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012166/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012167{
12168 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12169}
12170
INADA Naoki3ae20562017-01-16 20:41:20 +090012171/*[clinic input]
12172str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012173
INADA Naoki3ae20562017-01-16 20:41:20 +090012174Return True if the string is printable, False otherwise.
12175
12176A string is printable if all of its characters are considered printable in
12177repr() or if it is empty.
12178[clinic start generated code]*/
12179
12180static PyObject *
12181unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012182/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 Py_ssize_t i, length;
12185 int kind;
12186 void *data;
12187
12188 if (PyUnicode_READY(self) == -1)
12189 return NULL;
12190 length = PyUnicode_GET_LENGTH(self);
12191 kind = PyUnicode_KIND(self);
12192 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012193
12194 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 if (length == 1)
12196 return PyBool_FromLong(
12197 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012198
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 for (i = 0; i < length; i++) {
12200 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012201 Py_RETURN_FALSE;
12202 }
12203 }
12204 Py_RETURN_TRUE;
12205}
12206
INADA Naoki3ae20562017-01-16 20:41:20 +090012207/*[clinic input]
12208str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209
INADA Naoki3ae20562017-01-16 20:41:20 +090012210 iterable: object
12211 /
12212
12213Concatenate any number of strings.
12214
Martin Panter91a88662017-01-24 00:30:06 +000012215The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012216The result is returned as a new string.
12217
12218Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12219[clinic start generated code]*/
12220
12221static PyObject *
12222unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012223/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012224{
INADA Naoki3ae20562017-01-16 20:41:20 +090012225 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012226}
12227
Martin v. Löwis18e16552006-02-15 17:27:45 +000012228static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012229unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012231 if (PyUnicode_READY(self) == -1)
12232 return -1;
12233 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234}
12235
INADA Naoki3ae20562017-01-16 20:41:20 +090012236/*[clinic input]
12237str.ljust as unicode_ljust
12238
12239 width: Py_ssize_t
12240 fillchar: Py_UCS4 = ' '
12241 /
12242
12243Return a left-justified string of length width.
12244
12245Padding is done using the specified fill character (default is a space).
12246[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012247
12248static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012249unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12250/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012252 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012253 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012254
Victor Stinnerc4b49542011-12-11 22:44:26 +010012255 if (PyUnicode_GET_LENGTH(self) >= width)
12256 return unicode_result_unchanged(self);
12257
12258 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259}
12260
INADA Naoki3ae20562017-01-16 20:41:20 +090012261/*[clinic input]
12262str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263
INADA Naoki3ae20562017-01-16 20:41:20 +090012264Return a copy of the string converted to lowercase.
12265[clinic start generated code]*/
12266
12267static PyObject *
12268unicode_lower_impl(PyObject *self)
12269/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012270{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012271 if (PyUnicode_READY(self) == -1)
12272 return NULL;
12273 if (PyUnicode_IS_ASCII(self))
12274 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012275 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276}
12277
/* Strip modes.  _PyUnicode_XStrip() compares its striptype argument
   against these to decide which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Method name string for strip mode i (one of the three constants above). */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012286
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012287/* externally visible for str.strip(unicode) */
12288PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012289_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012291 void *data;
12292 int kind;
12293 Py_ssize_t i, j, len;
12294 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012295 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12298 return NULL;
12299
12300 kind = PyUnicode_KIND(self);
12301 data = PyUnicode_DATA(self);
12302 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012303 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012304 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12305 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012306 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012307
Benjamin Peterson14339b62009-01-31 16:36:08 +000012308 i = 0;
12309 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012310 while (i < len) {
12311 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12312 if (!BLOOM(sepmask, ch))
12313 break;
12314 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12315 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012316 i++;
12317 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012318 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012319
Benjamin Peterson14339b62009-01-31 16:36:08 +000012320 j = len;
12321 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012322 j--;
12323 while (j >= i) {
12324 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12325 if (!BLOOM(sepmask, ch))
12326 break;
12327 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12328 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012329 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012330 }
12331
Benjamin Peterson29060642009-01-31 22:14:21 +000012332 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012333 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012334
Victor Stinner7931d9a2011-11-04 00:22:48 +010012335 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012336}
12337
12338PyObject*
12339PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12340{
12341 unsigned char *data;
12342 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012343 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344
Victor Stinnerde636f32011-10-01 03:55:54 +020012345 if (PyUnicode_READY(self) == -1)
12346 return NULL;
12347
Victor Stinner684d5fd2012-05-03 02:32:34 +020012348 length = PyUnicode_GET_LENGTH(self);
12349 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012350
Victor Stinner684d5fd2012-05-03 02:32:34 +020012351 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012352 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012353
Victor Stinnerde636f32011-10-01 03:55:54 +020012354 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012355 PyErr_SetString(PyExc_IndexError, "string index out of range");
12356 return NULL;
12357 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012358 if (start >= length || end < start)
12359 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012360
Victor Stinner684d5fd2012-05-03 02:32:34 +020012361 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012362 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012363 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012364 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012365 }
12366 else {
12367 kind = PyUnicode_KIND(self);
12368 data = PyUnicode_1BYTE_DATA(self);
12369 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012370 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012371 length);
12372 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012373}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012374
12375static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012376do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012377{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012378 Py_ssize_t len, i, j;
12379
12380 if (PyUnicode_READY(self) == -1)
12381 return NULL;
12382
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012383 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012384
Victor Stinnercc7af722013-04-09 22:39:24 +020012385 if (PyUnicode_IS_ASCII(self)) {
12386 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12387
12388 i = 0;
12389 if (striptype != RIGHTSTRIP) {
12390 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012391 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012392 if (!_Py_ascii_whitespace[ch])
12393 break;
12394 i++;
12395 }
12396 }
12397
12398 j = len;
12399 if (striptype != LEFTSTRIP) {
12400 j--;
12401 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012402 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012403 if (!_Py_ascii_whitespace[ch])
12404 break;
12405 j--;
12406 }
12407 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012408 }
12409 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012410 else {
12411 int kind = PyUnicode_KIND(self);
12412 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012413
Victor Stinnercc7af722013-04-09 22:39:24 +020012414 i = 0;
12415 if (striptype != RIGHTSTRIP) {
12416 while (i < len) {
12417 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12418 if (!Py_UNICODE_ISSPACE(ch))
12419 break;
12420 i++;
12421 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012422 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012423
12424 j = len;
12425 if (striptype != LEFTSTRIP) {
12426 j--;
12427 while (j >= i) {
12428 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12429 if (!Py_UNICODE_ISSPACE(ch))
12430 break;
12431 j--;
12432 }
12433 j++;
12434 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012435 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012436
Victor Stinner7931d9a2011-11-04 00:22:48 +010012437 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012438}
12439
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440
12441static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012442do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012443{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012444 if (sep != NULL && sep != Py_None) {
12445 if (PyUnicode_Check(sep))
12446 return _PyUnicode_XStrip(self, striptype, sep);
12447 else {
12448 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012449 "%s arg must be None or str",
12450 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012451 return NULL;
12452 }
12453 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012456}
12457
12458
INADA Naoki3ae20562017-01-16 20:41:20 +090012459/*[clinic input]
12460str.strip as unicode_strip
12461
12462 chars: object = None
12463 /
12464
Victor Stinner0c4a8282017-01-17 02:21:47 +010012465Return a copy of the string with leading and trailing whitespace remove.
INADA Naoki3ae20562017-01-16 20:41:20 +090012466
12467If chars is given and not None, remove characters in chars instead.
12468[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012469
12470static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012471unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012472/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473{
INADA Naoki3ae20562017-01-16 20:41:20 +090012474 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012475}
12476
12477
INADA Naoki3ae20562017-01-16 20:41:20 +090012478/*[clinic input]
12479str.lstrip as unicode_lstrip
12480
12481 chars: object = NULL
12482 /
12483
12484Return a copy of the string with leading whitespace removed.
12485
12486If chars is given and not None, remove characters in chars instead.
12487[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012488
12489static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012490unicode_lstrip_impl(PyObject *self, PyObject *chars)
12491/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492{
INADA Naoki3ae20562017-01-16 20:41:20 +090012493 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494}
12495
12496
INADA Naoki3ae20562017-01-16 20:41:20 +090012497/*[clinic input]
12498str.rstrip as unicode_rstrip
12499
12500 chars: object = NULL
12501 /
12502
12503Return a copy of the string with trailing whitespace removed.
12504
12505If chars is given and not None, remove characters in chars instead.
12506[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012507
12508static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012509unicode_rstrip_impl(PyObject *self, PyObject *chars)
12510/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511{
INADA Naoki3ae20562017-01-16 20:41:20 +090012512 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513}
12514
12515
Guido van Rossumd57fd912000-03-10 22:53:23 +000012516static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012517unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012518{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012519 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012520 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012521
Serhiy Storchaka05997252013-01-26 12:14:02 +020012522 if (len < 1)
12523 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524
Victor Stinnerc4b49542011-12-11 22:44:26 +010012525 /* no repeat, return original string */
12526 if (len == 1)
12527 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012528
Benjamin Petersonbac79492012-01-14 13:34:47 -050012529 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012530 return NULL;
12531
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012532 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012533 PyErr_SetString(PyExc_OverflowError,
12534 "repeated string is too long");
12535 return NULL;
12536 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012537 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012538
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012539 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012540 if (!u)
12541 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012542 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 if (PyUnicode_GET_LENGTH(str) == 1) {
12545 const int kind = PyUnicode_KIND(str);
12546 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012547 if (kind == PyUnicode_1BYTE_KIND) {
12548 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012549 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012550 }
12551 else if (kind == PyUnicode_2BYTE_KIND) {
12552 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012553 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012554 ucs2[n] = fill_char;
12555 } else {
12556 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12557 assert(kind == PyUnicode_4BYTE_KIND);
12558 for (n = 0; n < len; ++n)
12559 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012560 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012561 }
12562 else {
12563 /* number of characters copied this far */
12564 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012565 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012566 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012567 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012568 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012569 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012571 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012572 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012574 }
12575
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012576 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012577 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578}
12579
Alexander Belopolsky40018472011-02-26 01:02:56 +000012580PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012581PyUnicode_Replace(PyObject *str,
12582 PyObject *substr,
12583 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012584 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012585{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012586 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12587 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012588 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012589 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012590}
12591
INADA Naoki3ae20562017-01-16 20:41:20 +090012592/*[clinic input]
12593str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594
INADA Naoki3ae20562017-01-16 20:41:20 +090012595 old: unicode
12596 new: unicode
12597 count: Py_ssize_t = -1
12598 Maximum number of occurrences to replace.
12599 -1 (the default value) means replace all occurrences.
12600 /
12601
12602Return a copy with all occurrences of substring old replaced by new.
12603
12604If the optional argument count is given, only the first count occurrences are
12605replaced.
12606[clinic start generated code]*/
12607
12608static PyObject *
12609unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12610 Py_ssize_t count)
12611/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012612{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012613 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012614 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012615 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616}
12617
Alexander Belopolsky40018472011-02-26 01:02:56 +000012618static PyObject *
12619unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012621 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012622 Py_ssize_t isize;
12623 Py_ssize_t osize, squote, dquote, i, o;
12624 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012625 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012629 return NULL;
12630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012631 isize = PyUnicode_GET_LENGTH(unicode);
12632 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 /* Compute length of output, quote characters, and
12635 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012636 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 max = 127;
12638 squote = dquote = 0;
12639 ikind = PyUnicode_KIND(unicode);
12640 for (i = 0; i < isize; i++) {
12641 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012642 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012644 case '\'': squote++; break;
12645 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012646 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012647 incr = 2;
12648 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 default:
12650 /* Fast-path ASCII */
12651 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012652 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012654 ;
12655 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012656 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012660 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012664 if (osize > PY_SSIZE_T_MAX - incr) {
12665 PyErr_SetString(PyExc_OverflowError,
12666 "string is too long to generate repr");
12667 return NULL;
12668 }
12669 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 }
12671
12672 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012673 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012675 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 if (dquote)
12677 /* Both squote and dquote present. Use squote,
12678 and escape them */
12679 osize += squote;
12680 else
12681 quote = '"';
12682 }
Victor Stinner55c08782013-04-14 18:45:39 +020012683 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684
12685 repr = PyUnicode_New(osize, max);
12686 if (repr == NULL)
12687 return NULL;
12688 okind = PyUnicode_KIND(repr);
12689 odata = PyUnicode_DATA(repr);
12690
12691 PyUnicode_WRITE(okind, odata, 0, quote);
12692 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012693 if (unchanged) {
12694 _PyUnicode_FastCopyCharacters(repr, 1,
12695 unicode, 0,
12696 isize);
12697 }
12698 else {
12699 for (i = 0, o = 1; i < isize; i++) {
12700 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012701
Victor Stinner55c08782013-04-14 18:45:39 +020012702 /* Escape quotes and backslashes */
12703 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012704 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012706 continue;
12707 }
12708
12709 /* Map special whitespace to '\t', \n', '\r' */
12710 if (ch == '\t') {
12711 PyUnicode_WRITE(okind, odata, o++, '\\');
12712 PyUnicode_WRITE(okind, odata, o++, 't');
12713 }
12714 else if (ch == '\n') {
12715 PyUnicode_WRITE(okind, odata, o++, '\\');
12716 PyUnicode_WRITE(okind, odata, o++, 'n');
12717 }
12718 else if (ch == '\r') {
12719 PyUnicode_WRITE(okind, odata, o++, '\\');
12720 PyUnicode_WRITE(okind, odata, o++, 'r');
12721 }
12722
12723 /* Map non-printable US ASCII to '\xhh' */
12724 else if (ch < ' ' || ch == 0x7F) {
12725 PyUnicode_WRITE(okind, odata, o++, '\\');
12726 PyUnicode_WRITE(okind, odata, o++, 'x');
12727 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12728 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12729 }
12730
12731 /* Copy ASCII characters as-is */
12732 else if (ch < 0x7F) {
12733 PyUnicode_WRITE(okind, odata, o++, ch);
12734 }
12735
12736 /* Non-ASCII characters */
12737 else {
12738 /* Map Unicode whitespace and control characters
12739 (categories Z* and C* except ASCII space)
12740 */
12741 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12742 PyUnicode_WRITE(okind, odata, o++, '\\');
12743 /* Map 8-bit characters to '\xhh' */
12744 if (ch <= 0xff) {
12745 PyUnicode_WRITE(okind, odata, o++, 'x');
12746 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12747 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12748 }
12749 /* Map 16-bit characters to '\uxxxx' */
12750 else if (ch <= 0xffff) {
12751 PyUnicode_WRITE(okind, odata, o++, 'u');
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12756 }
12757 /* Map 21-bit characters to '\U00xxxxxx' */
12758 else {
12759 PyUnicode_WRITE(okind, odata, o++, 'U');
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12768 }
12769 }
12770 /* Copy characters as-is */
12771 else {
12772 PyUnicode_WRITE(okind, odata, o++, ch);
12773 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012774 }
12775 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012776 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012778 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012779 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012780}
12781
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012782PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012783 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784\n\
12785Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012786such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012787arguments start and end are interpreted as in slice notation.\n\
12788\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012789Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790
12791static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012794 /* initialize variables to prevent gcc warning */
12795 PyObject *substring = NULL;
12796 Py_ssize_t start = 0;
12797 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012798 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012800 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012801 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012802
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012803 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012804 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012806 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 if (result == -2)
12809 return NULL;
12810
Christian Heimes217cfd12007-12-02 14:31:20 +000012811 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012812}
12813
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012814PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012815 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012817Return the highest index in S where substring sub is found,\n\
12818such that sub is contained within S[start:end]. Optional\n\
12819arguments start and end are interpreted as in slice notation.\n\
12820\n\
12821Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822
12823static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012824unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012825{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012826 /* initialize variables to prevent gcc warning */
12827 PyObject *substring = NULL;
12828 Py_ssize_t start = 0;
12829 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012830 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012832 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012833 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012834
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012835 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012836 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012837
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012838 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012839
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 if (result == -2)
12841 return NULL;
12842
Guido van Rossumd57fd912000-03-10 22:53:23 +000012843 if (result < 0) {
12844 PyErr_SetString(PyExc_ValueError, "substring not found");
12845 return NULL;
12846 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847
Christian Heimes217cfd12007-12-02 14:31:20 +000012848 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849}
12850
INADA Naoki3ae20562017-01-16 20:41:20 +090012851/*[clinic input]
12852str.rjust as unicode_rjust
12853
12854 width: Py_ssize_t
12855 fillchar: Py_UCS4 = ' '
12856 /
12857
12858Return a right-justified string of length width.
12859
12860Padding is done using the specified fill character (default is a space).
12861[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012862
12863static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012864unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12865/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012867 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868 return NULL;
12869
Victor Stinnerc4b49542011-12-11 22:44:26 +010012870 if (PyUnicode_GET_LENGTH(self) >= width)
12871 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872
Victor Stinnerc4b49542011-12-11 22:44:26 +010012873 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874}
12875
Alexander Belopolsky40018472011-02-26 01:02:56 +000012876PyObject *
12877PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012879 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012880 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012881
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012882 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012883}
12884
INADA Naoki3ae20562017-01-16 20:41:20 +090012885/*[clinic input]
12886str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
INADA Naoki3ae20562017-01-16 20:41:20 +090012888 sep: object = None
12889 The delimiter according which to split the string.
12890 None (the default value) means split according to any whitespace,
12891 and discard empty strings from the result.
12892 maxsplit: Py_ssize_t = -1
12893 Maximum number of splits to do.
12894 -1 (the default value) means no limit.
12895
12896Return a list of the words in the string, using sep as the delimiter string.
12897[clinic start generated code]*/
12898
12899static PyObject *
12900unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12901/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012902{
INADA Naoki3ae20562017-01-16 20:41:20 +090012903 if (sep == Py_None)
12904 return split(self, NULL, maxsplit);
12905 if (PyUnicode_Check(sep))
12906 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012907
Victor Stinner998b8062018-09-12 00:23:25 +020012908 PyErr_Format(PyExc_TypeError,
12909 "must be str or None, not %.100s",
12910 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012911 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012912}
12913
Thomas Wouters477c8d52006-05-27 19:21:47 +000012914PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012915PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012916{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012917 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012918 int kind1, kind2;
12919 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012920 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012922 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012923 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924
Victor Stinner14f8f022011-10-05 20:58:25 +020012925 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012927 len1 = PyUnicode_GET_LENGTH(str_obj);
12928 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012929 if (kind1 < kind2 || len1 < len2) {
12930 _Py_INCREF_UNICODE_EMPTY();
12931 if (!unicode_empty)
12932 out = NULL;
12933 else {
12934 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12935 Py_DECREF(unicode_empty);
12936 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012937 return out;
12938 }
12939 buf1 = PyUnicode_DATA(str_obj);
12940 buf2 = PyUnicode_DATA(sep_obj);
12941 if (kind2 != kind1) {
12942 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12943 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012944 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012945 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012946
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012947 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012948 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012949 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12950 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12951 else
12952 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012953 break;
12954 case PyUnicode_2BYTE_KIND:
12955 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12956 break;
12957 case PyUnicode_4BYTE_KIND:
12958 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 break;
12960 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012961 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012963
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012964 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012965 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012966
12967 return out;
12968}
12969
12970
12971PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012972PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012973{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012975 int kind1, kind2;
12976 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012977 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012979 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012980 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012982 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012984 len1 = PyUnicode_GET_LENGTH(str_obj);
12985 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 if (kind1 < kind2 || len1 < len2) {
12987 _Py_INCREF_UNICODE_EMPTY();
12988 if (!unicode_empty)
12989 out = NULL;
12990 else {
12991 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12992 Py_DECREF(unicode_empty);
12993 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012994 return out;
12995 }
12996 buf1 = PyUnicode_DATA(str_obj);
12997 buf2 = PyUnicode_DATA(sep_obj);
12998 if (kind2 != kind1) {
12999 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13000 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013001 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013002 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013003
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013004 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013005 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013006 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13007 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13008 else
13009 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013010 break;
13011 case PyUnicode_2BYTE_KIND:
13012 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13013 break;
13014 case PyUnicode_4BYTE_KIND:
13015 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13016 break;
13017 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013018 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013020
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013021 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013022 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013023
13024 return out;
13025}
13026
INADA Naoki3ae20562017-01-16 20:41:20 +090013027/*[clinic input]
13028str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013029
INADA Naoki3ae20562017-01-16 20:41:20 +090013030 sep: object
13031 /
13032
13033Partition the string into three parts using the given separator.
13034
13035This will search for the separator in the string. If the separator is found,
13036returns a 3-tuple containing the part before the separator, the separator
13037itself, and the part after it.
13038
13039If the separator is not found, returns a 3-tuple containing the original string
13040and two empty strings.
13041[clinic start generated code]*/
13042
13043static PyObject *
13044unicode_partition(PyObject *self, PyObject *sep)
13045/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013046{
INADA Naoki3ae20562017-01-16 20:41:20 +090013047 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013048}
13049
INADA Naoki3ae20562017-01-16 20:41:20 +090013050/*[clinic input]
13051str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052
INADA Naoki3ae20562017-01-16 20:41:20 +090013053Partition the string into three parts using the given separator.
13054
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013055This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013056the separator is found, returns a 3-tuple containing the part before the
13057separator, the separator itself, and the part after it.
13058
13059If the separator is not found, returns a 3-tuple containing two empty strings
13060and the original string.
13061[clinic start generated code]*/
13062
13063static PyObject *
13064unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013065/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013066{
INADA Naoki3ae20562017-01-16 20:41:20 +090013067 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013068}
13069
Alexander Belopolsky40018472011-02-26 01:02:56 +000013070PyObject *
13071PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013072{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013073 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013074 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013075
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013076 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013077}
13078
INADA Naoki3ae20562017-01-16 20:41:20 +090013079/*[clinic input]
13080str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013081
INADA Naoki3ae20562017-01-16 20:41:20 +090013082Return a list of the words in the string, using sep as the delimiter string.
13083
13084Splits are done starting at the end of the string and working to the front.
13085[clinic start generated code]*/
13086
13087static PyObject *
13088unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13089/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013090{
INADA Naoki3ae20562017-01-16 20:41:20 +090013091 if (sep == Py_None)
13092 return rsplit(self, NULL, maxsplit);
13093 if (PyUnicode_Check(sep))
13094 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013095
Victor Stinner998b8062018-09-12 00:23:25 +020013096 PyErr_Format(PyExc_TypeError,
13097 "must be str or None, not %.100s",
13098 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013099 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013100}
13101
INADA Naoki3ae20562017-01-16 20:41:20 +090013102/*[clinic input]
13103str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013104
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013105 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013106
13107Return a list of the lines in the string, breaking at line boundaries.
13108
13109Line breaks are not included in the resulting list unless keepends is given and
13110true.
13111[clinic start generated code]*/
13112
13113static PyObject *
13114unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013115/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013116{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013117 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118}
13119
13120static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013121PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013123 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124}
13125
INADA Naoki3ae20562017-01-16 20:41:20 +090013126/*[clinic input]
13127str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128
INADA Naoki3ae20562017-01-16 20:41:20 +090013129Convert uppercase characters to lowercase and lowercase characters to uppercase.
13130[clinic start generated code]*/
13131
13132static PyObject *
13133unicode_swapcase_impl(PyObject *self)
13134/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013135{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013136 if (PyUnicode_READY(self) == -1)
13137 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013138 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139}
13140
Larry Hastings61272b72014-01-07 12:41:53 -080013141/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013142
Larry Hastings31826802013-10-19 00:09:25 -070013143@staticmethod
13144str.maketrans as unicode_maketrans
13145
13146 x: object
13147
13148 y: unicode=NULL
13149
13150 z: unicode=NULL
13151
13152 /
13153
13154Return a translation table usable for str.translate().
13155
13156If there is only one argument, it must be a dictionary mapping Unicode
13157ordinals (integers) or characters to Unicode ordinals, strings or None.
13158Character keys will be then converted to ordinals.
13159If there are two arguments, they must be strings of equal length, and
13160in the resulting dictionary, each character in x will be mapped to the
13161character at the same position in y. If there is a third argument, it
13162must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013163[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013164
Larry Hastings31826802013-10-19 00:09:25 -070013165static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013166unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013167/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013168{
Georg Brandlceee0772007-11-27 23:48:05 +000013169 PyObject *new = NULL, *key, *value;
13170 Py_ssize_t i = 0;
13171 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013172
Georg Brandlceee0772007-11-27 23:48:05 +000013173 new = PyDict_New();
13174 if (!new)
13175 return NULL;
13176 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013177 int x_kind, y_kind, z_kind;
13178 void *x_data, *y_data, *z_data;
13179
Georg Brandlceee0772007-11-27 23:48:05 +000013180 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013181 if (!PyUnicode_Check(x)) {
13182 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13183 "be a string if there is a second argument");
13184 goto err;
13185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013186 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013187 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13188 "arguments must have equal length");
13189 goto err;
13190 }
13191 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 x_kind = PyUnicode_KIND(x);
13193 y_kind = PyUnicode_KIND(y);
13194 x_data = PyUnicode_DATA(x);
13195 y_data = PyUnicode_DATA(y);
13196 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13197 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013198 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013199 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013200 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013201 if (!value) {
13202 Py_DECREF(key);
13203 goto err;
13204 }
Georg Brandlceee0772007-11-27 23:48:05 +000013205 res = PyDict_SetItem(new, key, value);
13206 Py_DECREF(key);
13207 Py_DECREF(value);
13208 if (res < 0)
13209 goto err;
13210 }
13211 /* create entries for deleting chars in z */
13212 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013213 z_kind = PyUnicode_KIND(z);
13214 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013215 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013216 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013217 if (!key)
13218 goto err;
13219 res = PyDict_SetItem(new, key, Py_None);
13220 Py_DECREF(key);
13221 if (res < 0)
13222 goto err;
13223 }
13224 }
13225 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013226 int kind;
13227 void *data;
13228
Georg Brandlceee0772007-11-27 23:48:05 +000013229 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013230 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013231 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13232 "to maketrans it must be a dict");
13233 goto err;
13234 }
13235 /* copy entries into the new dict, converting string keys to int keys */
13236 while (PyDict_Next(x, &i, &key, &value)) {
13237 if (PyUnicode_Check(key)) {
13238 /* convert string keys to integer keys */
13239 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013240 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013241 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13242 "table must be of length 1");
13243 goto err;
13244 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013245 kind = PyUnicode_KIND(key);
13246 data = PyUnicode_DATA(key);
13247 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013248 if (!newkey)
13249 goto err;
13250 res = PyDict_SetItem(new, newkey, value);
13251 Py_DECREF(newkey);
13252 if (res < 0)
13253 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013254 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013255 /* just keep integer keys */
13256 if (PyDict_SetItem(new, key, value) < 0)
13257 goto err;
13258 } else {
13259 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13260 "be strings or integers");
13261 goto err;
13262 }
13263 }
13264 }
13265 return new;
13266 err:
13267 Py_DECREF(new);
13268 return NULL;
13269}
13270
INADA Naoki3ae20562017-01-16 20:41:20 +090013271/*[clinic input]
13272str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013273
INADA Naoki3ae20562017-01-16 20:41:20 +090013274 table: object
13275 Translation table, which must be a mapping of Unicode ordinals to
13276 Unicode ordinals, strings, or None.
13277 /
13278
13279Replace each character in the string using the given translation table.
13280
13281The table must implement lookup/indexing via __getitem__, for instance a
13282dictionary or list. If this operation raises LookupError, the character is
13283left untouched. Characters mapped to None are deleted.
13284[clinic start generated code]*/
13285
13286static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013287unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013288/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013289{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013290 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013291}
13292
INADA Naoki3ae20562017-01-16 20:41:20 +090013293/*[clinic input]
13294str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295
INADA Naoki3ae20562017-01-16 20:41:20 +090013296Return a copy of the string converted to uppercase.
13297[clinic start generated code]*/
13298
13299static PyObject *
13300unicode_upper_impl(PyObject *self)
13301/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013302{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013303 if (PyUnicode_READY(self) == -1)
13304 return NULL;
13305 if (PyUnicode_IS_ASCII(self))
13306 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013307 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308}
13309
INADA Naoki3ae20562017-01-16 20:41:20 +090013310/*[clinic input]
13311str.zfill as unicode_zfill
13312
13313 width: Py_ssize_t
13314 /
13315
13316Pad a numeric string with zeros on the left, to fill a field of the given width.
13317
13318The string is never truncated.
13319[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013320
13321static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013322unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013323/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013325 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013326 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013327 int kind;
13328 void *data;
13329 Py_UCS4 chr;
13330
Benjamin Petersonbac79492012-01-14 13:34:47 -050013331 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013333
Victor Stinnerc4b49542011-12-11 22:44:26 +010013334 if (PyUnicode_GET_LENGTH(self) >= width)
13335 return unicode_result_unchanged(self);
13336
13337 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338
13339 u = pad(self, fill, 0, '0');
13340
Walter Dörwald068325e2002-04-15 13:36:47 +000013341 if (u == NULL)
13342 return NULL;
13343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013344 kind = PyUnicode_KIND(u);
13345 data = PyUnicode_DATA(u);
13346 chr = PyUnicode_READ(kind, data, fill);
13347
13348 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013349 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 PyUnicode_WRITE(kind, data, 0, chr);
13351 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352 }
13353
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013354 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013355 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357
#if 0
/* Compiled out.  Thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13365
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013366PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013367 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013369Return True if S starts with the specified prefix, False otherwise.\n\
13370With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013371With optional end, stop comparing S at that position.\n\
13372prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013373
13374static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013375unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013376 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013378 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013379 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013380 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013381 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383
Jesus Ceaac451502011-04-20 17:09:23 +020013384 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013385 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 if (PyTuple_Check(subobj)) {
13387 Py_ssize_t i;
13388 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013389 substring = PyTuple_GET_ITEM(subobj, i);
13390 if (!PyUnicode_Check(substring)) {
13391 PyErr_Format(PyExc_TypeError,
13392 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013393 "not %.100s",
13394 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013395 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013396 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013397 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013398 if (result == -1)
13399 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 if (result) {
13401 Py_RETURN_TRUE;
13402 }
13403 }
13404 /* nothing matched */
13405 Py_RETURN_FALSE;
13406 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013407 if (!PyUnicode_Check(subobj)) {
13408 PyErr_Format(PyExc_TypeError,
13409 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013410 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013411 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013412 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013413 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013414 if (result == -1)
13415 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013416 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013417}
13418
13419
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013420PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013421 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013422\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013423Return True if S ends with the specified suffix, False otherwise.\n\
13424With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013425With optional end, stop comparing S at that position.\n\
13426suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013427
13428static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013429unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013430 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013432 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013433 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013434 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013435 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437
Jesus Ceaac451502011-04-20 17:09:23 +020013438 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013439 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440 if (PyTuple_Check(subobj)) {
13441 Py_ssize_t i;
13442 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013443 substring = PyTuple_GET_ITEM(subobj, i);
13444 if (!PyUnicode_Check(substring)) {
13445 PyErr_Format(PyExc_TypeError,
13446 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013447 "not %.100s",
13448 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013449 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013450 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013451 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013452 if (result == -1)
13453 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013454 if (result) {
13455 Py_RETURN_TRUE;
13456 }
13457 }
13458 Py_RETURN_FALSE;
13459 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013460 if (!PyUnicode_Check(subobj)) {
13461 PyErr_Format(PyExc_TypeError,
13462 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013463 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013464 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013465 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013466 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013467 if (result == -1)
13468 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013469 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013470}
13471
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013472static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013473_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013474{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013475 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13476 writer->data = PyUnicode_DATA(writer->buffer);
13477
13478 if (!writer->readonly) {
13479 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013480 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013481 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013482 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013483 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13484 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13485 writer->kind = PyUnicode_WCHAR_KIND;
13486 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13487
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 /* Copy-on-write mode: set buffer size to 0 so
13489 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13490 * next write. */
13491 writer->size = 0;
13492 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013493}
13494
Victor Stinnerd3f08822012-05-29 12:57:52 +020013495void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013496_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013497{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013498 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013499
13500 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013501 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013502
13503 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13504 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13505 writer->kind = PyUnicode_WCHAR_KIND;
13506 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013507}
13508
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509int
13510_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13511 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013512{
13513 Py_ssize_t newlen;
13514 PyObject *newbuffer;
13515
Victor Stinner2740e462016-09-06 16:58:36 -070013516 assert(maxchar <= MAX_UNICODE);
13517
Victor Stinnerca9381e2015-09-22 00:58:32 +020013518 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013519 assert((maxchar > writer->maxchar && length >= 0)
13520 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013521
Victor Stinner202fdca2012-05-07 12:47:02 +020013522 if (length > PY_SSIZE_T_MAX - writer->pos) {
13523 PyErr_NoMemory();
13524 return -1;
13525 }
13526 newlen = writer->pos + length;
13527
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013528 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013529
Victor Stinnerd3f08822012-05-29 12:57:52 +020013530 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013531 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013532 if (writer->overallocate
13533 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13534 /* overallocate to limit the number of realloc() */
13535 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013536 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013537 if (newlen < writer->min_length)
13538 newlen = writer->min_length;
13539
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 writer->buffer = PyUnicode_New(newlen, maxchar);
13541 if (writer->buffer == NULL)
13542 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013543 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013544 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013545 if (writer->overallocate
13546 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13547 /* overallocate to limit the number of realloc() */
13548 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013549 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013550 if (newlen < writer->min_length)
13551 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013552
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013553 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013554 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013555 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013556 newbuffer = PyUnicode_New(newlen, maxchar);
13557 if (newbuffer == NULL)
13558 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013559 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13560 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013561 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013562 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013563 }
13564 else {
13565 newbuffer = resize_compact(writer->buffer, newlen);
13566 if (newbuffer == NULL)
13567 return -1;
13568 }
13569 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013570 }
13571 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013572 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013573 newbuffer = PyUnicode_New(writer->size, maxchar);
13574 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013576 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13577 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013578 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013579 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013580 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013581 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013582
13583#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013584}
13585
Victor Stinnerca9381e2015-09-22 00:58:32 +020013586int
13587_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13588 enum PyUnicode_Kind kind)
13589{
13590 Py_UCS4 maxchar;
13591
13592 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13593 assert(writer->kind < kind);
13594
13595 switch (kind)
13596 {
13597 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13598 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13599 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13600 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013601 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013602 }
13603
13604 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13605}
13606
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013607static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013608_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013609{
Victor Stinner2740e462016-09-06 16:58:36 -070013610 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013611 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13612 return -1;
13613 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13614 writer->pos++;
13615 return 0;
13616}
13617
13618int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013619_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13620{
13621 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13622}
13623
13624int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013625_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13626{
13627 Py_UCS4 maxchar;
13628 Py_ssize_t len;
13629
13630 if (PyUnicode_READY(str) == -1)
13631 return -1;
13632 len = PyUnicode_GET_LENGTH(str);
13633 if (len == 0)
13634 return 0;
13635 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13636 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013637 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013638 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013639 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013640 Py_INCREF(str);
13641 writer->buffer = str;
13642 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013643 writer->pos += len;
13644 return 0;
13645 }
13646 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13647 return -1;
13648 }
13649 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13650 str, 0, len);
13651 writer->pos += len;
13652 return 0;
13653}
13654
Victor Stinnere215d962012-10-06 23:03:36 +020013655int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013656_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13657 Py_ssize_t start, Py_ssize_t end)
13658{
13659 Py_UCS4 maxchar;
13660 Py_ssize_t len;
13661
13662 if (PyUnicode_READY(str) == -1)
13663 return -1;
13664
13665 assert(0 <= start);
13666 assert(end <= PyUnicode_GET_LENGTH(str));
13667 assert(start <= end);
13668
13669 if (end == 0)
13670 return 0;
13671
13672 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13673 return _PyUnicodeWriter_WriteStr(writer, str);
13674
13675 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13676 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13677 else
13678 maxchar = writer->maxchar;
13679 len = end - start;
13680
13681 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13682 return -1;
13683
13684 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13685 str, start, len);
13686 writer->pos += len;
13687 return 0;
13688}
13689
13690int
Victor Stinner4a587072013-11-19 12:54:53 +010013691_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13692 const char *ascii, Py_ssize_t len)
13693{
13694 if (len == -1)
13695 len = strlen(ascii);
13696
13697 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13698
13699 if (writer->buffer == NULL && !writer->overallocate) {
13700 PyObject *str;
13701
13702 str = _PyUnicode_FromASCII(ascii, len);
13703 if (str == NULL)
13704 return -1;
13705
13706 writer->readonly = 1;
13707 writer->buffer = str;
13708 _PyUnicodeWriter_Update(writer);
13709 writer->pos += len;
13710 return 0;
13711 }
13712
13713 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13714 return -1;
13715
13716 switch (writer->kind)
13717 {
13718 case PyUnicode_1BYTE_KIND:
13719 {
13720 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13721 Py_UCS1 *data = writer->data;
13722
Christian Heimesf051e432016-09-13 20:22:02 +020013723 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013724 break;
13725 }
13726 case PyUnicode_2BYTE_KIND:
13727 {
13728 _PyUnicode_CONVERT_BYTES(
13729 Py_UCS1, Py_UCS2,
13730 ascii, ascii + len,
13731 (Py_UCS2 *)writer->data + writer->pos);
13732 break;
13733 }
13734 case PyUnicode_4BYTE_KIND:
13735 {
13736 _PyUnicode_CONVERT_BYTES(
13737 Py_UCS1, Py_UCS4,
13738 ascii, ascii + len,
13739 (Py_UCS4 *)writer->data + writer->pos);
13740 break;
13741 }
13742 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013743 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013744 }
13745
13746 writer->pos += len;
13747 return 0;
13748}
13749
13750int
13751_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13752 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013753{
13754 Py_UCS4 maxchar;
13755
13756 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13757 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13758 return -1;
13759 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13760 writer->pos += len;
13761 return 0;
13762}
13763
Victor Stinnerd3f08822012-05-29 12:57:52 +020013764PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013765_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013766{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013767 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013768
Victor Stinnerd3f08822012-05-29 12:57:52 +020013769 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013770 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013771 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013772 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013773
13774 str = writer->buffer;
13775 writer->buffer = NULL;
13776
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013777 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013778 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13779 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013780 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013781
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013782 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13783 PyObject *str2;
13784 str2 = resize_compact(str, writer->pos);
13785 if (str2 == NULL) {
13786 Py_DECREF(str);
13787 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013788 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013789 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013790 }
13791
Victor Stinner15a0bd32013-07-08 22:29:55 +020013792 assert(_PyUnicode_CheckConsistency(str, 1));
13793 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013794}
13795
Victor Stinnerd3f08822012-05-29 12:57:52 +020013796void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013797_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013798{
13799 Py_CLEAR(writer->buffer);
13800}
13801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013802#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013803
13804PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013805 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013806\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013807Return a formatted version of S, using substitutions from args and kwargs.\n\
13808The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013809
Eric Smith27bbca62010-11-04 17:06:58 +000013810PyDoc_STRVAR(format_map__doc__,
13811 "S.format_map(mapping) -> str\n\
13812\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013813Return a formatted version of S, using substitutions from mapping.\n\
13814The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013815
INADA Naoki3ae20562017-01-16 20:41:20 +090013816/*[clinic input]
13817str.__format__ as unicode___format__
13818
13819 format_spec: unicode
13820 /
13821
13822Return a formatted version of the string as described by format_spec.
13823[clinic start generated code]*/
13824
Eric Smith4a7d76d2008-05-30 18:10:19 +000013825static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013826unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013827/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013828{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013829 _PyUnicodeWriter writer;
13830 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013831
Victor Stinnerd3f08822012-05-29 12:57:52 +020013832 if (PyUnicode_READY(self) == -1)
13833 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013834 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13836 self, format_spec, 0,
13837 PyUnicode_GET_LENGTH(format_spec));
13838 if (ret == -1) {
13839 _PyUnicodeWriter_Dealloc(&writer);
13840 return NULL;
13841 }
13842 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013843}
13844
INADA Naoki3ae20562017-01-16 20:41:20 +090013845/*[clinic input]
13846str.__sizeof__ as unicode_sizeof
13847
13848Return the size of the string in memory, in bytes.
13849[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013850
13851static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013852unicode_sizeof_impl(PyObject *self)
13853/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013854{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013855 Py_ssize_t size;
13856
13857 /* If it's a compact object, account for base structure +
13858 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013859 if (PyUnicode_IS_COMPACT_ASCII(self))
13860 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13861 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013862 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013863 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013864 else {
13865 /* If it is a two-block object, account for base object, and
13866 for character block if present. */
13867 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013868 if (_PyUnicode_DATA_ANY(self))
13869 size += (PyUnicode_GET_LENGTH(self) + 1) *
13870 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013871 }
13872 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013873 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013874 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13875 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13876 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13877 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878
13879 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013880}
13881
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013882static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013883unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013884{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013885 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886 if (!copy)
13887 return NULL;
13888 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013889}
13890
Guido van Rossumd57fd912000-03-10 22:53:23 +000013891static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013892 UNICODE_ENCODE_METHODDEF
13893 UNICODE_REPLACE_METHODDEF
13894 UNICODE_SPLIT_METHODDEF
13895 UNICODE_RSPLIT_METHODDEF
13896 UNICODE_JOIN_METHODDEF
13897 UNICODE_CAPITALIZE_METHODDEF
13898 UNICODE_CASEFOLD_METHODDEF
13899 UNICODE_TITLE_METHODDEF
13900 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013901 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013902 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013903 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013904 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013905 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013906 UNICODE_LJUST_METHODDEF
13907 UNICODE_LOWER_METHODDEF
13908 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013909 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13910 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013911 UNICODE_RJUST_METHODDEF
13912 UNICODE_RSTRIP_METHODDEF
13913 UNICODE_RPARTITION_METHODDEF
13914 UNICODE_SPLITLINES_METHODDEF
13915 UNICODE_STRIP_METHODDEF
13916 UNICODE_SWAPCASE_METHODDEF
13917 UNICODE_TRANSLATE_METHODDEF
13918 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013919 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13920 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013921 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013922 UNICODE_ISLOWER_METHODDEF
13923 UNICODE_ISUPPER_METHODDEF
13924 UNICODE_ISTITLE_METHODDEF
13925 UNICODE_ISSPACE_METHODDEF
13926 UNICODE_ISDECIMAL_METHODDEF
13927 UNICODE_ISDIGIT_METHODDEF
13928 UNICODE_ISNUMERIC_METHODDEF
13929 UNICODE_ISALPHA_METHODDEF
13930 UNICODE_ISALNUM_METHODDEF
13931 UNICODE_ISIDENTIFIER_METHODDEF
13932 UNICODE_ISPRINTABLE_METHODDEF
13933 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013934 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013935 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013936 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013937 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013938 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013939#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013940 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013941 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013942#endif
13943
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013944 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013945 {NULL, NULL}
13946};
13947
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013948static PyObject *
13949unicode_mod(PyObject *v, PyObject *w)
13950{
Brian Curtindfc80e32011-08-10 20:28:54 -050013951 if (!PyUnicode_Check(v))
13952 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013953 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013954}
13955
13956static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013957 0, /*nb_add*/
13958 0, /*nb_subtract*/
13959 0, /*nb_multiply*/
13960 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013961};
13962
Guido van Rossumd57fd912000-03-10 22:53:23 +000013963static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013964 (lenfunc) unicode_length, /* sq_length */
13965 PyUnicode_Concat, /* sq_concat */
13966 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13967 (ssizeargfunc) unicode_getitem, /* sq_item */
13968 0, /* sq_slice */
13969 0, /* sq_ass_item */
13970 0, /* sq_ass_slice */
13971 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013972};
13973
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013974static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013975unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013977 if (PyUnicode_READY(self) == -1)
13978 return NULL;
13979
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013980 if (PyIndex_Check(item)) {
13981 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982 if (i == -1 && PyErr_Occurred())
13983 return NULL;
13984 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013985 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013986 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013987 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060013988 Py_ssize_t start, stop, step, slicelength, i;
13989 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013990 PyObject *result;
13991 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013992 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013993 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013994
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013995 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 return NULL;
13997 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013998 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
13999 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014000
14001 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014002 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014003 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014004 slicelength == PyUnicode_GET_LENGTH(self)) {
14005 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014006 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014007 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014008 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014009 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014010 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014011 src_kind = PyUnicode_KIND(self);
14012 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014013 if (!PyUnicode_IS_ASCII(self)) {
14014 kind_limit = kind_maxchar_limit(src_kind);
14015 max_char = 0;
14016 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14017 ch = PyUnicode_READ(src_kind, src_data, cur);
14018 if (ch > max_char) {
14019 max_char = ch;
14020 if (max_char >= kind_limit)
14021 break;
14022 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014023 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 }
Victor Stinner55c99112011-10-13 01:17:06 +020014025 else
14026 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014027 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014028 if (result == NULL)
14029 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014030 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014031 dest_data = PyUnicode_DATA(result);
14032
14033 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014034 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14035 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014036 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014037 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014038 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014039 } else {
14040 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14041 return NULL;
14042 }
14043}
14044
14045static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014046 (lenfunc)unicode_length, /* mp_length */
14047 (binaryfunc)unicode_subscript, /* mp_subscript */
14048 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014049};
14050
Guido van Rossumd57fd912000-03-10 22:53:23 +000014051
Guido van Rossumd57fd912000-03-10 22:53:23 +000014052/* Helpers for PyUnicode_Format() */
14053
Victor Stinnera47082312012-10-04 02:19:54 +020014054struct unicode_formatter_t {
14055 PyObject *args;
14056 int args_owned;
14057 Py_ssize_t arglen, argidx;
14058 PyObject *dict;
14059
14060 enum PyUnicode_Kind fmtkind;
14061 Py_ssize_t fmtcnt, fmtpos;
14062 void *fmtdata;
14063 PyObject *fmtstr;
14064
14065 _PyUnicodeWriter writer;
14066};
14067
14068struct unicode_format_arg_t {
14069 Py_UCS4 ch;
14070 int flags;
14071 Py_ssize_t width;
14072 int prec;
14073 int sign;
14074};
14075
Guido van Rossumd57fd912000-03-10 22:53:23 +000014076static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014077unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014078{
Victor Stinnera47082312012-10-04 02:19:54 +020014079 Py_ssize_t argidx = ctx->argidx;
14080
14081 if (argidx < ctx->arglen) {
14082 ctx->argidx++;
14083 if (ctx->arglen < 0)
14084 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014085 else
Victor Stinnera47082312012-10-04 02:19:54 +020014086 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014087 }
14088 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014089 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090 return NULL;
14091}
14092
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014093/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094
Victor Stinnera47082312012-10-04 02:19:54 +020014095/* Format a float into the writer if the writer is not NULL, or into *p_output
14096 otherwise.
14097
14098 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014099static int
Victor Stinnera47082312012-10-04 02:19:54 +020014100formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14101 PyObject **p_output,
14102 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014103{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014104 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014105 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014106 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014107 int prec;
14108 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014109
Guido van Rossumd57fd912000-03-10 22:53:23 +000014110 x = PyFloat_AsDouble(v);
14111 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014113
Victor Stinnera47082312012-10-04 02:19:54 +020014114 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014115 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014116 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014117
Victor Stinnera47082312012-10-04 02:19:54 +020014118 if (arg->flags & F_ALT)
14119 dtoa_flags = Py_DTSF_ALT;
14120 else
14121 dtoa_flags = 0;
14122 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014123 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014124 return -1;
14125 len = strlen(p);
14126 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014127 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014128 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014129 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014130 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014131 }
14132 else
14133 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014134 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014135 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014136}
14137
Victor Stinnerd0880d52012-04-27 23:40:13 +020014138/* formatlong() emulates the format codes d, u, o, x and X, and
14139 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14140 * Python's regular ints.
14141 * Return value: a new PyUnicodeObject*, or NULL if error.
14142 * The output string is of the form
14143 * "-"? ("0x" | "0X")? digit+
14144 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14145 * set in flags. The case of hex digits will be correct,
14146 * There will be at least prec digits, zero-filled on the left if
14147 * necessary to get that many.
14148 * val object to be converted
14149 * flags bitmask of format flags; only F_ALT is looked at
14150 * prec minimum number of digits; 0-fill on left if needed
14151 * type a character in [duoxX]; u acts the same as d
14152 *
14153 * CAUTION: o, x and X conversions on regular ints can never
14154 * produce a '-' sign, but can for Python's unbounded ints.
14155 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014156PyObject *
14157_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014158{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014159 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014160 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014161 Py_ssize_t i;
14162 int sign; /* 1 if '-', else 0 */
14163 int len; /* number of characters */
14164 Py_ssize_t llen;
14165 int numdigits; /* len == numnondigits + numdigits */
14166 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014167
Victor Stinnerd0880d52012-04-27 23:40:13 +020014168 /* Avoid exceeding SSIZE_T_MAX */
14169 if (prec > INT_MAX-3) {
14170 PyErr_SetString(PyExc_OverflowError,
14171 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014172 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 }
14174
14175 assert(PyLong_Check(val));
14176
14177 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014178 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014179 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014180 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014181 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014182 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014183 /* int and int subclasses should print numerically when a numeric */
14184 /* format code is used (see issue18780) */
14185 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014186 break;
14187 case 'o':
14188 numnondigits = 2;
14189 result = PyNumber_ToBase(val, 8);
14190 break;
14191 case 'x':
14192 case 'X':
14193 numnondigits = 2;
14194 result = PyNumber_ToBase(val, 16);
14195 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014196 }
14197 if (!result)
14198 return NULL;
14199
14200 assert(unicode_modifiable(result));
14201 assert(PyUnicode_IS_READY(result));
14202 assert(PyUnicode_IS_ASCII(result));
14203
14204 /* To modify the string in-place, there can only be one reference. */
14205 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014206 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014207 PyErr_BadInternalCall();
14208 return NULL;
14209 }
14210 buf = PyUnicode_DATA(result);
14211 llen = PyUnicode_GET_LENGTH(result);
14212 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014213 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014214 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014215 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014216 return NULL;
14217 }
14218 len = (int)llen;
14219 sign = buf[0] == '-';
14220 numnondigits += sign;
14221 numdigits = len - numnondigits;
14222 assert(numdigits > 0);
14223
14224 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014225 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014226 (type == 'o' || type == 'x' || type == 'X'))) {
14227 assert(buf[sign] == '0');
14228 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14229 buf[sign+1] == 'o');
14230 numnondigits -= 2;
14231 buf += 2;
14232 len -= 2;
14233 if (sign)
14234 buf[0] = '-';
14235 assert(len == numnondigits + numdigits);
14236 assert(numdigits > 0);
14237 }
14238
14239 /* Fill with leading zeroes to meet minimum width. */
14240 if (prec > numdigits) {
14241 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14242 numnondigits + prec);
14243 char *b1;
14244 if (!r1) {
14245 Py_DECREF(result);
14246 return NULL;
14247 }
14248 b1 = PyBytes_AS_STRING(r1);
14249 for (i = 0; i < numnondigits; ++i)
14250 *b1++ = *buf++;
14251 for (i = 0; i < prec - numdigits; i++)
14252 *b1++ = '0';
14253 for (i = 0; i < numdigits; i++)
14254 *b1++ = *buf++;
14255 *b1 = '\0';
14256 Py_DECREF(result);
14257 result = r1;
14258 buf = PyBytes_AS_STRING(result);
14259 len = numnondigits + prec;
14260 }
14261
14262 /* Fix up case for hex conversions. */
14263 if (type == 'X') {
14264 /* Need to convert all lower case letters to upper case.
14265 and need to convert 0x to 0X (and -0x to -0X). */
14266 for (i = 0; i < len; i++)
14267 if (buf[i] >= 'a' && buf[i] <= 'x')
14268 buf[i] -= 'a'-'A';
14269 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014270 if (!PyUnicode_Check(result)
14271 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014272 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014273 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014274 Py_DECREF(result);
14275 result = unicode;
14276 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014277 else if (len != PyUnicode_GET_LENGTH(result)) {
14278 if (PyUnicode_Resize(&result, len) < 0)
14279 Py_CLEAR(result);
14280 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014281 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014282}
14283
Ethan Furmandf3ed242014-01-05 06:50:30 -080014284/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014285 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014286 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014287 * -1 and raise an exception on error */
14288static int
Victor Stinnera47082312012-10-04 02:19:54 +020014289mainformatlong(PyObject *v,
14290 struct unicode_format_arg_t *arg,
14291 PyObject **p_output,
14292 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014293{
14294 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014295 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014296
14297 if (!PyNumber_Check(v))
14298 goto wrongtype;
14299
Ethan Furman9ab74802014-03-21 06:38:46 -070014300 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014302 if (type == 'o' || type == 'x' || type == 'X') {
14303 iobj = PyNumber_Index(v);
14304 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014305 if (PyErr_ExceptionMatches(PyExc_TypeError))
14306 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014307 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014308 }
14309 }
14310 else {
14311 iobj = PyNumber_Long(v);
14312 if (iobj == NULL ) {
14313 if (PyErr_ExceptionMatches(PyExc_TypeError))
14314 goto wrongtype;
14315 return -1;
14316 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014317 }
14318 assert(PyLong_Check(iobj));
14319 }
14320 else {
14321 iobj = v;
14322 Py_INCREF(iobj);
14323 }
14324
14325 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014326 && arg->width == -1 && arg->prec == -1
14327 && !(arg->flags & (F_SIGN | F_BLANK))
14328 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014329 {
14330 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014331 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014332 int base;
14333
Victor Stinnera47082312012-10-04 02:19:54 +020014334 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014335 {
14336 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014337 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014338 case 'd':
14339 case 'i':
14340 case 'u':
14341 base = 10;
14342 break;
14343 case 'o':
14344 base = 8;
14345 break;
14346 case 'x':
14347 case 'X':
14348 base = 16;
14349 break;
14350 }
14351
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014352 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14353 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014354 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014355 }
14356 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014357 return 1;
14358 }
14359
Ethan Furmanb95b5612015-01-23 20:05:18 -080014360 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014361 Py_DECREF(iobj);
14362 if (res == NULL)
14363 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014364 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014365 return 0;
14366
14367wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014368 switch(type)
14369 {
14370 case 'o':
14371 case 'x':
14372 case 'X':
14373 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014374 "%%%c format: an integer is required, "
14375 "not %.200s",
14376 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014377 break;
14378 default:
14379 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014380 "%%%c format: a number is required, "
14381 "not %.200s",
14382 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014383 break;
14384 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014385 return -1;
14386}
14387
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014388static Py_UCS4
14389formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014390{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014391 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014392 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014393 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014394 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014395 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014396 goto onError;
14397 }
14398 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014399 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014401 /* make sure number is a type of integer */
14402 if (!PyLong_Check(v)) {
14403 iobj = PyNumber_Index(v);
14404 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014405 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014406 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014407 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014408 Py_DECREF(iobj);
14409 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014410 else {
14411 x = PyLong_AsLong(v);
14412 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014413 if (x == -1 && PyErr_Occurred())
14414 goto onError;
14415
Victor Stinner8faf8212011-12-08 22:14:11 +010014416 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 PyErr_SetString(PyExc_OverflowError,
14418 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014419 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014420 }
14421
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014422 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014423 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014424
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014426 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014428 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014429}
14430
Victor Stinnera47082312012-10-04 02:19:54 +020014431/* Parse options of an argument: flags, width, precision.
14432 Handle also "%(name)" syntax.
14433
14434 Return 0 if the argument has been formatted into arg->str.
14435 Return 1 if the argument has been written into ctx->writer,
14436 Raise an exception and return -1 on error. */
14437static int
14438unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14439 struct unicode_format_arg_t *arg)
14440{
14441#define FORMAT_READ(ctx) \
14442 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14443
14444 PyObject *v;
14445
Victor Stinnera47082312012-10-04 02:19:54 +020014446 if (arg->ch == '(') {
14447 /* Get argument value from a dictionary. Example: "%(name)s". */
14448 Py_ssize_t keystart;
14449 Py_ssize_t keylen;
14450 PyObject *key;
14451 int pcount = 1;
14452
14453 if (ctx->dict == NULL) {
14454 PyErr_SetString(PyExc_TypeError,
14455 "format requires a mapping");
14456 return -1;
14457 }
14458 ++ctx->fmtpos;
14459 --ctx->fmtcnt;
14460 keystart = ctx->fmtpos;
14461 /* Skip over balanced parentheses */
14462 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14463 arg->ch = FORMAT_READ(ctx);
14464 if (arg->ch == ')')
14465 --pcount;
14466 else if (arg->ch == '(')
14467 ++pcount;
14468 ctx->fmtpos++;
14469 }
14470 keylen = ctx->fmtpos - keystart - 1;
14471 if (ctx->fmtcnt < 0 || pcount > 0) {
14472 PyErr_SetString(PyExc_ValueError,
14473 "incomplete format key");
14474 return -1;
14475 }
14476 key = PyUnicode_Substring(ctx->fmtstr,
14477 keystart, keystart + keylen);
14478 if (key == NULL)
14479 return -1;
14480 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014481 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014482 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014483 }
14484 ctx->args = PyObject_GetItem(ctx->dict, key);
14485 Py_DECREF(key);
14486 if (ctx->args == NULL)
14487 return -1;
14488 ctx->args_owned = 1;
14489 ctx->arglen = -1;
14490 ctx->argidx = -2;
14491 }
14492
14493 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014494 while (--ctx->fmtcnt >= 0) {
14495 arg->ch = FORMAT_READ(ctx);
14496 ctx->fmtpos++;
14497 switch (arg->ch) {
14498 case '-': arg->flags |= F_LJUST; continue;
14499 case '+': arg->flags |= F_SIGN; continue;
14500 case ' ': arg->flags |= F_BLANK; continue;
14501 case '#': arg->flags |= F_ALT; continue;
14502 case '0': arg->flags |= F_ZERO; continue;
14503 }
14504 break;
14505 }
14506
14507 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014508 if (arg->ch == '*') {
14509 v = unicode_format_getnextarg(ctx);
14510 if (v == NULL)
14511 return -1;
14512 if (!PyLong_Check(v)) {
14513 PyErr_SetString(PyExc_TypeError,
14514 "* wants int");
14515 return -1;
14516 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014517 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014518 if (arg->width == -1 && PyErr_Occurred())
14519 return -1;
14520 if (arg->width < 0) {
14521 arg->flags |= F_LJUST;
14522 arg->width = -arg->width;
14523 }
14524 if (--ctx->fmtcnt >= 0) {
14525 arg->ch = FORMAT_READ(ctx);
14526 ctx->fmtpos++;
14527 }
14528 }
14529 else if (arg->ch >= '0' && arg->ch <= '9') {
14530 arg->width = arg->ch - '0';
14531 while (--ctx->fmtcnt >= 0) {
14532 arg->ch = FORMAT_READ(ctx);
14533 ctx->fmtpos++;
14534 if (arg->ch < '0' || arg->ch > '9')
14535 break;
14536 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14537 mixing signed and unsigned comparison. Since arg->ch is between
14538 '0' and '9', casting to int is safe. */
14539 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14540 PyErr_SetString(PyExc_ValueError,
14541 "width too big");
14542 return -1;
14543 }
14544 arg->width = arg->width*10 + (arg->ch - '0');
14545 }
14546 }
14547
14548 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014549 if (arg->ch == '.') {
14550 arg->prec = 0;
14551 if (--ctx->fmtcnt >= 0) {
14552 arg->ch = FORMAT_READ(ctx);
14553 ctx->fmtpos++;
14554 }
14555 if (arg->ch == '*') {
14556 v = unicode_format_getnextarg(ctx);
14557 if (v == NULL)
14558 return -1;
14559 if (!PyLong_Check(v)) {
14560 PyErr_SetString(PyExc_TypeError,
14561 "* wants int");
14562 return -1;
14563 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014564 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014565 if (arg->prec == -1 && PyErr_Occurred())
14566 return -1;
14567 if (arg->prec < 0)
14568 arg->prec = 0;
14569 if (--ctx->fmtcnt >= 0) {
14570 arg->ch = FORMAT_READ(ctx);
14571 ctx->fmtpos++;
14572 }
14573 }
14574 else if (arg->ch >= '0' && arg->ch <= '9') {
14575 arg->prec = arg->ch - '0';
14576 while (--ctx->fmtcnt >= 0) {
14577 arg->ch = FORMAT_READ(ctx);
14578 ctx->fmtpos++;
14579 if (arg->ch < '0' || arg->ch > '9')
14580 break;
14581 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14582 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014583 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014584 return -1;
14585 }
14586 arg->prec = arg->prec*10 + (arg->ch - '0');
14587 }
14588 }
14589 }
14590
14591 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14592 if (ctx->fmtcnt >= 0) {
14593 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14594 if (--ctx->fmtcnt >= 0) {
14595 arg->ch = FORMAT_READ(ctx);
14596 ctx->fmtpos++;
14597 }
14598 }
14599 }
14600 if (ctx->fmtcnt < 0) {
14601 PyErr_SetString(PyExc_ValueError,
14602 "incomplete format");
14603 return -1;
14604 }
14605 return 0;
14606
14607#undef FORMAT_READ
14608}
14609
14610/* Format one argument. Supported conversion specifiers:
14611
14612 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014613 - "i", "d", "u": int or float
14614 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014615 - "e", "E", "f", "F", "g", "G": float
14616 - "c": int or str (1 character)
14617
Victor Stinner8dbd4212012-12-04 09:30:24 +010014618 When possible, the output is written directly into the Unicode writer
14619 (ctx->writer). A string is created when padding is required.
14620
Victor Stinnera47082312012-10-04 02:19:54 +020014621 Return 0 if the argument has been formatted into *p_str,
14622 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014623 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014624static int
14625unicode_format_arg_format(struct unicode_formatter_t *ctx,
14626 struct unicode_format_arg_t *arg,
14627 PyObject **p_str)
14628{
14629 PyObject *v;
14630 _PyUnicodeWriter *writer = &ctx->writer;
14631
14632 if (ctx->fmtcnt == 0)
14633 ctx->writer.overallocate = 0;
14634
Victor Stinnera47082312012-10-04 02:19:54 +020014635 v = unicode_format_getnextarg(ctx);
14636 if (v == NULL)
14637 return -1;
14638
Victor Stinnera47082312012-10-04 02:19:54 +020014639
14640 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014641 case 's':
14642 case 'r':
14643 case 'a':
14644 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14645 /* Fast path */
14646 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14647 return -1;
14648 return 1;
14649 }
14650
14651 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14652 *p_str = v;
14653 Py_INCREF(*p_str);
14654 }
14655 else {
14656 if (arg->ch == 's')
14657 *p_str = PyObject_Str(v);
14658 else if (arg->ch == 'r')
14659 *p_str = PyObject_Repr(v);
14660 else
14661 *p_str = PyObject_ASCII(v);
14662 }
14663 break;
14664
14665 case 'i':
14666 case 'd':
14667 case 'u':
14668 case 'o':
14669 case 'x':
14670 case 'X':
14671 {
14672 int ret = mainformatlong(v, arg, p_str, writer);
14673 if (ret != 0)
14674 return ret;
14675 arg->sign = 1;
14676 break;
14677 }
14678
14679 case 'e':
14680 case 'E':
14681 case 'f':
14682 case 'F':
14683 case 'g':
14684 case 'G':
14685 if (arg->width == -1 && arg->prec == -1
14686 && !(arg->flags & (F_SIGN | F_BLANK)))
14687 {
14688 /* Fast path */
14689 if (formatfloat(v, arg, NULL, writer) == -1)
14690 return -1;
14691 return 1;
14692 }
14693
14694 arg->sign = 1;
14695 if (formatfloat(v, arg, p_str, NULL) == -1)
14696 return -1;
14697 break;
14698
14699 case 'c':
14700 {
14701 Py_UCS4 ch = formatchar(v);
14702 if (ch == (Py_UCS4) -1)
14703 return -1;
14704 if (arg->width == -1 && arg->prec == -1) {
14705 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014706 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014707 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014708 return 1;
14709 }
14710 *p_str = PyUnicode_FromOrdinal(ch);
14711 break;
14712 }
14713
14714 default:
14715 PyErr_Format(PyExc_ValueError,
14716 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014717 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014718 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14719 (int)arg->ch,
14720 ctx->fmtpos - 1);
14721 return -1;
14722 }
14723 if (*p_str == NULL)
14724 return -1;
14725 assert (PyUnicode_Check(*p_str));
14726 return 0;
14727}
14728
14729static int
14730unicode_format_arg_output(struct unicode_formatter_t *ctx,
14731 struct unicode_format_arg_t *arg,
14732 PyObject *str)
14733{
14734 Py_ssize_t len;
14735 enum PyUnicode_Kind kind;
14736 void *pbuf;
14737 Py_ssize_t pindex;
14738 Py_UCS4 signchar;
14739 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014740 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014741 Py_ssize_t sublen;
14742 _PyUnicodeWriter *writer = &ctx->writer;
14743 Py_UCS4 fill;
14744
14745 fill = ' ';
14746 if (arg->sign && arg->flags & F_ZERO)
14747 fill = '0';
14748
14749 if (PyUnicode_READY(str) == -1)
14750 return -1;
14751
14752 len = PyUnicode_GET_LENGTH(str);
14753 if ((arg->width == -1 || arg->width <= len)
14754 && (arg->prec == -1 || arg->prec >= len)
14755 && !(arg->flags & (F_SIGN | F_BLANK)))
14756 {
14757 /* Fast path */
14758 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14759 return -1;
14760 return 0;
14761 }
14762
14763 /* Truncate the string for "s", "r" and "a" formats
14764 if the precision is set */
14765 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14766 if (arg->prec >= 0 && len > arg->prec)
14767 len = arg->prec;
14768 }
14769
14770 /* Adjust sign and width */
14771 kind = PyUnicode_KIND(str);
14772 pbuf = PyUnicode_DATA(str);
14773 pindex = 0;
14774 signchar = '\0';
14775 if (arg->sign) {
14776 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14777 if (ch == '-' || ch == '+') {
14778 signchar = ch;
14779 len--;
14780 pindex++;
14781 }
14782 else if (arg->flags & F_SIGN)
14783 signchar = '+';
14784 else if (arg->flags & F_BLANK)
14785 signchar = ' ';
14786 else
14787 arg->sign = 0;
14788 }
14789 if (arg->width < len)
14790 arg->width = len;
14791
14792 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014793 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014794 if (!(arg->flags & F_LJUST)) {
14795 if (arg->sign) {
14796 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014797 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014798 }
14799 else {
14800 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014801 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014802 }
14803 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014804 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14805 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014806 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014807 }
14808
Victor Stinnera47082312012-10-04 02:19:54 +020014809 buflen = arg->width;
14810 if (arg->sign && len == arg->width)
14811 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014812 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014813 return -1;
14814
14815 /* Write the sign if needed */
14816 if (arg->sign) {
14817 if (fill != ' ') {
14818 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14819 writer->pos += 1;
14820 }
14821 if (arg->width > len)
14822 arg->width--;
14823 }
14824
14825 /* Write the numeric prefix for "x", "X" and "o" formats
14826 if the alternate form is used.
14827 For example, write "0x" for the "%#x" format. */
14828 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14829 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14830 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14831 if (fill != ' ') {
14832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14833 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14834 writer->pos += 2;
14835 pindex += 2;
14836 }
14837 arg->width -= 2;
14838 if (arg->width < 0)
14839 arg->width = 0;
14840 len -= 2;
14841 }
14842
14843 /* Pad left with the fill character if needed */
14844 if (arg->width > len && !(arg->flags & F_LJUST)) {
14845 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014846 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014847 writer->pos += sublen;
14848 arg->width = len;
14849 }
14850
14851 /* If padding with spaces: write sign if needed and/or numeric prefix if
14852 the alternate form is used */
14853 if (fill == ' ') {
14854 if (arg->sign) {
14855 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14856 writer->pos += 1;
14857 }
14858 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14859 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14860 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14862 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14863 writer->pos += 2;
14864 pindex += 2;
14865 }
14866 }
14867
14868 /* Write characters */
14869 if (len) {
14870 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14871 str, pindex, len);
14872 writer->pos += len;
14873 }
14874
14875 /* Pad right with the fill character if needed */
14876 if (arg->width > len) {
14877 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014878 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014879 writer->pos += sublen;
14880 }
14881 return 0;
14882}
14883
14884/* Helper of PyUnicode_Format(): format one arg.
14885 Return 0 on success, raise an exception and return -1 on error. */
14886static int
14887unicode_format_arg(struct unicode_formatter_t *ctx)
14888{
14889 struct unicode_format_arg_t arg;
14890 PyObject *str;
14891 int ret;
14892
Victor Stinner8dbd4212012-12-04 09:30:24 +010014893 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014894 if (arg.ch == '%') {
14895 ctx->fmtpos++;
14896 ctx->fmtcnt--;
14897 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14898 return -1;
14899 return 0;
14900 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014901 arg.flags = 0;
14902 arg.width = -1;
14903 arg.prec = -1;
14904 arg.sign = 0;
14905 str = NULL;
14906
Victor Stinnera47082312012-10-04 02:19:54 +020014907 ret = unicode_format_arg_parse(ctx, &arg);
14908 if (ret == -1)
14909 return -1;
14910
14911 ret = unicode_format_arg_format(ctx, &arg, &str);
14912 if (ret == -1)
14913 return -1;
14914
14915 if (ret != 1) {
14916 ret = unicode_format_arg_output(ctx, &arg, str);
14917 Py_DECREF(str);
14918 if (ret == -1)
14919 return -1;
14920 }
14921
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014922 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014923 PyErr_SetString(PyExc_TypeError,
14924 "not all arguments converted during string formatting");
14925 return -1;
14926 }
14927 return 0;
14928}
14929
Alexander Belopolsky40018472011-02-26 01:02:56 +000014930PyObject *
14931PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014932{
Victor Stinnera47082312012-10-04 02:19:54 +020014933 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014934
Guido van Rossumd57fd912000-03-10 22:53:23 +000014935 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014936 PyErr_BadInternalCall();
14937 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938 }
Victor Stinnera47082312012-10-04 02:19:54 +020014939
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014940 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014941 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014942
14943 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014944 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14945 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14946 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14947 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014948
Victor Stinner8f674cc2013-04-17 23:02:17 +020014949 _PyUnicodeWriter_Init(&ctx.writer);
14950 ctx.writer.min_length = ctx.fmtcnt + 100;
14951 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014952
Guido van Rossumd57fd912000-03-10 22:53:23 +000014953 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014954 ctx.arglen = PyTuple_Size(args);
14955 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014956 }
14957 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014958 ctx.arglen = -1;
14959 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960 }
Victor Stinnera47082312012-10-04 02:19:54 +020014961 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014962 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014963 ctx.dict = args;
14964 else
14965 ctx.dict = NULL;
14966 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014967
Victor Stinnera47082312012-10-04 02:19:54 +020014968 while (--ctx.fmtcnt >= 0) {
14969 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014970 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014971
14972 nonfmtpos = ctx.fmtpos++;
14973 while (ctx.fmtcnt >= 0 &&
14974 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14975 ctx.fmtpos++;
14976 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014977 }
Victor Stinnera47082312012-10-04 02:19:54 +020014978 if (ctx.fmtcnt < 0) {
14979 ctx.fmtpos--;
14980 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014981 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014982
Victor Stinnercfc4c132013-04-03 01:48:39 +020014983 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14984 nonfmtpos, ctx.fmtpos) < 0)
14985 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014986 }
14987 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014988 ctx.fmtpos++;
14989 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014990 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014991 }
14992 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014993
Victor Stinnera47082312012-10-04 02:19:54 +020014994 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014995 PyErr_SetString(PyExc_TypeError,
14996 "not all arguments converted during string formatting");
14997 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014998 }
14999
Victor Stinnera47082312012-10-04 02:19:54 +020015000 if (ctx.args_owned) {
15001 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015002 }
Victor Stinnera47082312012-10-04 02:19:54 +020015003 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004
Benjamin Peterson29060642009-01-31 22:14:21 +000015005 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015006 _PyUnicodeWriter_Dealloc(&ctx.writer);
15007 if (ctx.args_owned) {
15008 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015009 }
15010 return NULL;
15011}
15012
Jeremy Hylton938ace62002-07-17 16:30:39 +000015013static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015014unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15015
Tim Peters6d6c1a32001-08-02 04:15:00 +000015016static PyObject *
15017unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15018{
Benjamin Peterson29060642009-01-31 22:14:21 +000015019 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015020 static char *kwlist[] = {"object", "encoding", "errors", 0};
15021 char *encoding = NULL;
15022 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015023
Benjamin Peterson14339b62009-01-31 16:36:08 +000015024 if (type != &PyUnicode_Type)
15025 return unicode_subtype_new(type, args, kwds);
15026 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015027 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 return NULL;
15029 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015030 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015031 if (encoding == NULL && errors == NULL)
15032 return PyObject_Str(x);
15033 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015034 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015035}
15036
Guido van Rossume023fe02001-08-30 03:12:59 +000015037static PyObject *
15038unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15039{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015040 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015041 Py_ssize_t length, char_size;
15042 int share_wstr, share_utf8;
15043 unsigned int kind;
15044 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015045
Benjamin Peterson14339b62009-01-31 16:36:08 +000015046 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015048 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015051 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015052 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015053 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015054 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015055 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015056
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015057 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 if (self == NULL) {
15059 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 return NULL;
15061 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 kind = PyUnicode_KIND(unicode);
15063 length = PyUnicode_GET_LENGTH(unicode);
15064
15065 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015066#ifdef Py_DEBUG
15067 _PyUnicode_HASH(self) = -1;
15068#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015069 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015070#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015071 _PyUnicode_STATE(self).interned = 0;
15072 _PyUnicode_STATE(self).kind = kind;
15073 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015074 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 _PyUnicode_STATE(self).ready = 1;
15076 _PyUnicode_WSTR(self) = NULL;
15077 _PyUnicode_UTF8_LENGTH(self) = 0;
15078 _PyUnicode_UTF8(self) = NULL;
15079 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015080 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015081
15082 share_utf8 = 0;
15083 share_wstr = 0;
15084 if (kind == PyUnicode_1BYTE_KIND) {
15085 char_size = 1;
15086 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15087 share_utf8 = 1;
15088 }
15089 else if (kind == PyUnicode_2BYTE_KIND) {
15090 char_size = 2;
15091 if (sizeof(wchar_t) == 2)
15092 share_wstr = 1;
15093 }
15094 else {
15095 assert(kind == PyUnicode_4BYTE_KIND);
15096 char_size = 4;
15097 if (sizeof(wchar_t) == 4)
15098 share_wstr = 1;
15099 }
15100
15101 /* Ensure we won't overflow the length. */
15102 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15103 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015104 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015105 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015106 data = PyObject_MALLOC((length + 1) * char_size);
15107 if (data == NULL) {
15108 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015109 goto onError;
15110 }
15111
Victor Stinnerc3c74152011-10-02 20:39:55 +020015112 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015113 if (share_utf8) {
15114 _PyUnicode_UTF8_LENGTH(self) = length;
15115 _PyUnicode_UTF8(self) = data;
15116 }
15117 if (share_wstr) {
15118 _PyUnicode_WSTR_LENGTH(self) = length;
15119 _PyUnicode_WSTR(self) = (wchar_t *)data;
15120 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015121
Christian Heimesf051e432016-09-13 20:22:02 +020015122 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015123 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015124 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015125#ifdef Py_DEBUG
15126 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15127#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015128 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015129 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015130
15131onError:
15132 Py_DECREF(unicode);
15133 Py_DECREF(self);
15134 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015135}
15136
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015137PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015138"str(object='') -> str\n\
15139str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015140\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015141Create a new string object from the given object. If encoding or\n\
15142errors is specified, then the object must expose a data buffer\n\
15143that will be decoded using the given encoding and error handler.\n\
15144Otherwise, returns the result of object.__str__() (if defined)\n\
15145or repr(object).\n\
15146encoding defaults to sys.getdefaultencoding().\n\
15147errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015148
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015149static PyObject *unicode_iter(PyObject *seq);
15150
Guido van Rossumd57fd912000-03-10 22:53:23 +000015151PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015152 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015153 "str", /* tp_name */
15154 sizeof(PyUnicodeObject), /* tp_basicsize */
15155 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015156 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015157 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015158 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015159 0, /* tp_getattr */
15160 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015161 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015162 unicode_repr, /* tp_repr */
15163 &unicode_as_number, /* tp_as_number */
15164 &unicode_as_sequence, /* tp_as_sequence */
15165 &unicode_as_mapping, /* tp_as_mapping */
15166 (hashfunc) unicode_hash, /* tp_hash*/
15167 0, /* tp_call*/
15168 (reprfunc) unicode_str, /* tp_str */
15169 PyObject_GenericGetAttr, /* tp_getattro */
15170 0, /* tp_setattro */
15171 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015172 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015173 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15174 unicode_doc, /* tp_doc */
15175 0, /* tp_traverse */
15176 0, /* tp_clear */
15177 PyUnicode_RichCompare, /* tp_richcompare */
15178 0, /* tp_weaklistoffset */
15179 unicode_iter, /* tp_iter */
15180 0, /* tp_iternext */
15181 unicode_methods, /* tp_methods */
15182 0, /* tp_members */
15183 0, /* tp_getset */
15184 &PyBaseObject_Type, /* tp_base */
15185 0, /* tp_dict */
15186 0, /* tp_descr_get */
15187 0, /* tp_descr_set */
15188 0, /* tp_dictoffset */
15189 0, /* tp_init */
15190 0, /* tp_alloc */
15191 unicode_new, /* tp_new */
15192 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015193};
15194
15195/* Initialize the Unicode implementation */
15196
Victor Stinner331a6a52019-05-27 16:39:22 +020015197PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015198_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015199{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015200 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015201 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015202 0x000A, /* LINE FEED */
15203 0x000D, /* CARRIAGE RETURN */
15204 0x001C, /* FILE SEPARATOR */
15205 0x001D, /* GROUP SEPARATOR */
15206 0x001E, /* RECORD SEPARATOR */
15207 0x0085, /* NEXT LINE */
15208 0x2028, /* LINE SEPARATOR */
15209 0x2029, /* PARAGRAPH SEPARATOR */
15210 };
15211
Fred Drakee4315f52000-05-09 19:53:39 +000015212 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015213 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015214 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015215 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015216 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015217 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015218
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015219 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015220 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015221 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015222
15223 /* initialize the linebreak bloom filter */
15224 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015225 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015226 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015227
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015228 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015229 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015230 }
15231 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015232 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015233 }
15234 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015235 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015236 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015237 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015238}
15239
/* Finalize the Unicode implementation */

/* Nothing is released by this function; it always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;
    return released;
}
15247
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015248
Walter Dörwald16807132007-05-25 13:52:07 +000015249void
15250PyUnicode_InternInPlace(PyObject **p)
15251{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015252 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015253 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015254#ifdef Py_DEBUG
15255 assert(s != NULL);
15256 assert(_PyUnicode_CHECK(s));
15257#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015258 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015259 return;
15260#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 /* If it's a subclass, we don't really know what putting
15262 it in the interned dict might do. */
15263 if (!PyUnicode_CheckExact(s))
15264 return;
15265 if (PyUnicode_CHECK_INTERNED(s))
15266 return;
15267 if (interned == NULL) {
15268 interned = PyDict_New();
15269 if (interned == NULL) {
15270 PyErr_Clear(); /* Don't leave an exception */
15271 return;
15272 }
15273 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015274 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015275 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015276 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015277 if (t == NULL) {
15278 PyErr_Clear();
15279 return;
15280 }
15281 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015282 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015283 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015284 return;
15285 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015286 /* The two references in interned are not counted by refcnt.
15287 The deallocator will take care of this */
15288 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015289 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015290}
15291
15292void
15293PyUnicode_InternImmortal(PyObject **p)
15294{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015295 PyUnicode_InternInPlace(p);
15296 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015297 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015298 Py_INCREF(*p);
15299 }
Walter Dörwald16807132007-05-25 13:52:07 +000015300}
15301
15302PyObject *
15303PyUnicode_InternFromString(const char *cp)
15304{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015305 PyObject *s = PyUnicode_FromString(cp);
15306 if (s == NULL)
15307 return NULL;
15308 PyUnicode_InternInPlace(&s);
15309 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015310}
15311
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015312
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Give every interned string its uncounted references back, reset its
   interned state, then clear and drop the interned dict. */
static void
unicode_release_interned(void)
{
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }

        int state = PyUnicode_CHECK_INTERNED(s);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
        }
        else if (state != SSTATE_NOT_INTERNED) {
            Py_FatalError("Inconsistent interned string state.");
        }
        /* else: SSTATE_NOT_INTERNED -- XXX Shouldn't happen */

        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015372
15373
15374/********************* Unicode Iterator **************************/
15375
15376typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015377 PyObject_HEAD
15378 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015379 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015380} unicodeiterobject;
15381
15382static void
15383unicodeiter_dealloc(unicodeiterobject *it)
15384{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015385 _PyObject_GC_UNTRACK(it);
15386 Py_XDECREF(it->it_seq);
15387 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015388}
15389
15390static int
15391unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15392{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 Py_VISIT(it->it_seq);
15394 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015395}
15396
15397static PyObject *
15398unicodeiter_next(unicodeiterobject *it)
15399{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015400 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401
Benjamin Peterson14339b62009-01-31 16:36:08 +000015402 assert(it != NULL);
15403 seq = it->it_seq;
15404 if (seq == NULL)
15405 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015406 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015407
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015408 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15409 int kind = PyUnicode_KIND(seq);
15410 void *data = PyUnicode_DATA(seq);
15411 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15412 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015413 if (item != NULL)
15414 ++it->it_index;
15415 return item;
15416 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015417
Benjamin Peterson14339b62009-01-31 16:36:08 +000015418 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015419 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015420 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015421}
15422
15423static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015424unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015425{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 Py_ssize_t len = 0;
15427 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015428 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015429 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015430}
15431
15432PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15433
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015434static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015435unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015436{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015437 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015438 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015439 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015440 it->it_seq, it->it_index);
15441 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015442 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015443 if (u == NULL)
15444 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015445 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015446 }
15447}
15448
15449PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15450
15451static PyObject *
15452unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15453{
15454 Py_ssize_t index = PyLong_AsSsize_t(state);
15455 if (index == -1 && PyErr_Occurred())
15456 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015457 if (it->it_seq != NULL) {
15458 if (index < 0)
15459 index = 0;
15460 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15461 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15462 it->it_index = index;
15463 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015464 Py_RETURN_NONE;
15465}
15466
15467PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15468
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015469static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015470 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015471 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015472 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15473 reduce_doc},
15474 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15475 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015477};
15478
15479PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015480 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15481 "str_iterator", /* tp_name */
15482 sizeof(unicodeiterobject), /* tp_basicsize */
15483 0, /* tp_itemsize */
15484 /* methods */
15485 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015486 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015487 0, /* tp_getattr */
15488 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015489 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015490 0, /* tp_repr */
15491 0, /* tp_as_number */
15492 0, /* tp_as_sequence */
15493 0, /* tp_as_mapping */
15494 0, /* tp_hash */
15495 0, /* tp_call */
15496 0, /* tp_str */
15497 PyObject_GenericGetAttr, /* tp_getattro */
15498 0, /* tp_setattro */
15499 0, /* tp_as_buffer */
15500 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15501 0, /* tp_doc */
15502 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15503 0, /* tp_clear */
15504 0, /* tp_richcompare */
15505 0, /* tp_weaklistoffset */
15506 PyObject_SelfIter, /* tp_iter */
15507 (iternextfunc)unicodeiter_next, /* tp_iternext */
15508 unicodeiter_methods, /* tp_methods */
15509 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015510};
15511
15512static PyObject *
15513unicode_iter(PyObject *seq)
15514{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015515 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015516
Benjamin Peterson14339b62009-01-31 16:36:08 +000015517 if (!PyUnicode_Check(seq)) {
15518 PyErr_BadInternalCall();
15519 return NULL;
15520 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015521 if (PyUnicode_READY(seq) == -1)
15522 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015523 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15524 if (it == NULL)
15525 return NULL;
15526 it->it_index = 0;
15527 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015528 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015529 _PyObject_GC_TRACK(it);
15530 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015531}
15532
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015533
15534size_t
15535Py_UNICODE_strlen(const Py_UNICODE *u)
15536{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015537 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015538}
15539
15540Py_UNICODE*
15541Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15542{
15543 Py_UNICODE *u = s1;
15544 while ((*u++ = *s2++));
15545 return s1;
15546}
15547
15548Py_UNICODE*
15549Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15550{
15551 Py_UNICODE *u = s1;
15552 while ((*u++ = *s2++))
15553 if (n-- == 0)
15554 break;
15555 return s1;
15556}
15557
15558Py_UNICODE*
15559Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15560{
15561 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015562 u1 += wcslen(u1);
15563 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015564 return s1;
15565}
15566
15567int
15568Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15569{
15570 while (*s1 && *s2 && *s1 == *s2)
15571 s1++, s2++;
15572 if (*s1 && *s2)
15573 return (*s1 < *s2) ? -1 : +1;
15574 if (*s1)
15575 return 1;
15576 if (*s2)
15577 return -1;
15578 return 0;
15579}
15580
15581int
15582Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15583{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015584 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015585 for (; n != 0; n--) {
15586 u1 = *s1;
15587 u2 = *s2;
15588 if (u1 != u2)
15589 return (u1 < u2) ? -1 : +1;
15590 if (u1 == '\0')
15591 return 0;
15592 s1++;
15593 s2++;
15594 }
15595 return 0;
15596}
15597
15598Py_UNICODE*
15599Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15600{
15601 const Py_UNICODE *p;
15602 for (p = s; *p; p++)
15603 if (*p == c)
15604 return (Py_UNICODE*)p;
15605 return NULL;
15606}
15607
15608Py_UNICODE*
15609Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15610{
15611 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015612 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015613 while (p != s) {
15614 p--;
15615 if (*p == c)
15616 return (Py_UNICODE*)p;
15617 }
15618 return NULL;
15619}
Victor Stinner331ea922010-08-10 16:37:20 +000015620
Victor Stinner71133ff2010-09-01 23:43:53 +000015621Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015622PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015623{
Victor Stinner577db2c2011-10-11 22:12:48 +020015624 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015625 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015626
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015627 if (!PyUnicode_Check(unicode)) {
15628 PyErr_BadArgument();
15629 return NULL;
15630 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015631 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015632 if (u == NULL)
15633 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015634 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015635 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015636 PyErr_NoMemory();
15637 return NULL;
15638 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015639 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015640 size *= sizeof(Py_UNICODE);
15641 copy = PyMem_Malloc(size);
15642 if (copy == NULL) {
15643 PyErr_NoMemory();
15644 return NULL;
15645 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015646 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015647 return copy;
15648}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015649
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015650
Victor Stinner709d23d2019-05-02 14:56:30 -040015651static int
15652encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015653{
Victor Stinner709d23d2019-05-02 14:56:30 -040015654 int res;
15655 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15656 if (res == -2) {
15657 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15658 return -1;
15659 }
15660 if (res < 0) {
15661 PyErr_NoMemory();
15662 return -1;
15663 }
15664 return 0;
15665}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015666
Victor Stinner709d23d2019-05-02 14:56:30 -040015667
15668static int
15669config_get_codec_name(wchar_t **config_encoding)
15670{
15671 char *encoding;
15672 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15673 return -1;
15674 }
15675
15676 PyObject *name_obj = NULL;
15677 PyObject *codec = _PyCodec_Lookup(encoding);
15678 PyMem_RawFree(encoding);
15679
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015680 if (!codec)
15681 goto error;
15682
15683 name_obj = PyObject_GetAttrString(codec, "name");
15684 Py_CLEAR(codec);
15685 if (!name_obj) {
15686 goto error;
15687 }
15688
Victor Stinner709d23d2019-05-02 14:56:30 -040015689 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15690 Py_DECREF(name_obj);
15691 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015692 goto error;
15693 }
15694
Victor Stinner709d23d2019-05-02 14:56:30 -040015695 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15696 if (raw_wname == NULL) {
15697 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015698 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015699 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015700 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015701
15702 PyMem_RawFree(*config_encoding);
15703 *config_encoding = raw_wname;
15704
15705 PyMem_Free(wname);
15706 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015707
15708error:
15709 Py_XDECREF(codec);
15710 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015711 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015712}
15713
15714
Victor Stinner331a6a52019-05-27 16:39:22 +020015715static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015716init_stdio_encoding(PyInterpreterState *interp)
15717{
Victor Stinner709d23d2019-05-02 14:56:30 -040015718 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015719 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015720 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015721 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015722 "of the stdio encoding");
15723 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015724 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015725}
15726
15727
Victor Stinner709d23d2019-05-02 14:56:30 -040015728static int
15729init_fs_codec(PyInterpreterState *interp)
15730{
Victor Stinner331a6a52019-05-27 16:39:22 +020015731 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015732
15733 _Py_error_handler error_handler;
15734 error_handler = get_error_handler_wide(config->filesystem_errors);
15735 if (error_handler == _Py_ERROR_UNKNOWN) {
15736 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15737 return -1;
15738 }
15739
15740 char *encoding, *errors;
15741 if (encode_wstr_utf8(config->filesystem_encoding,
15742 &encoding,
15743 "filesystem_encoding") < 0) {
15744 return -1;
15745 }
15746
15747 if (encode_wstr_utf8(config->filesystem_errors,
15748 &errors,
15749 "filesystem_errors") < 0) {
15750 PyMem_RawFree(encoding);
15751 return -1;
15752 }
15753
15754 PyMem_RawFree(interp->fs_codec.encoding);
15755 interp->fs_codec.encoding = encoding;
15756 PyMem_RawFree(interp->fs_codec.errors);
15757 interp->fs_codec.errors = errors;
15758 interp->fs_codec.error_handler = error_handler;
15759
15760 /* At this point, PyUnicode_EncodeFSDefault() and
15761 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15762 the C implementation of the filesystem encoding. */
15763
15764 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15765 global configuration variables. */
15766 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15767 interp->fs_codec.errors) < 0) {
15768 PyErr_NoMemory();
15769 return -1;
15770 }
15771 return 0;
15772}
15773
15774
Victor Stinner331a6a52019-05-27 16:39:22 +020015775static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015776init_fs_encoding(PyInterpreterState *interp)
15777{
Victor Stinner709d23d2019-05-02 14:56:30 -040015778 /* Update the filesystem encoding to the normalized Python codec name.
15779 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15780 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015781 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015782 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015783 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015784 "of the filesystem encoding");
15785 }
15786
Victor Stinner709d23d2019-05-02 14:56:30 -040015787 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015788 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015789 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015790 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015791}
15792
15793
Victor Stinner331a6a52019-05-27 16:39:22 +020015794PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015795_PyUnicode_InitEncodings(PyInterpreterState *interp)
15796{
Victor Stinner331a6a52019-05-27 16:39:22 +020015797 PyStatus status = init_fs_encoding(interp);
15798 if (_PyStatus_EXCEPTION(status)) {
15799 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015800 }
15801
15802 return init_stdio_encoding(interp);
15803}
15804
15805
#ifdef MS_WINDOWS
/* Set the filesystem encoding to mbcs/replace (PEP 529) in the current
   interpreter's configuration and re-initialize the filesystem codec.
   Returns -1 with MemoryError set if the new strings cannot be
   allocated, otherwise the result of init_fs_codec(). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one duplication failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15831
15832
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015833void
15834_PyUnicode_Fini(void)
15835{
15836#if defined(WITH_VALGRIND) || defined(__INSURE__)
15837 /* Insure++ is a memory analysis tool that aids in discovering
15838 * memory leaks and other memory problems. On Python exit, the
15839 * interned string dictionaries are flagged as being in use at exit
15840 * (which it is). Under normal circumstances, this is fine because
15841 * the memory will be automatically reclaimed by the system. Under
15842 * memory debugging, it's a huge source of useless noise, so we
15843 * trade off slower shutdown for less distraction in the memory
15844 * reports. -baw
15845 */
15846 unicode_release_interned();
15847#endif /* __INSURE__ */
15848
15849 Py_CLEAR(unicode_empty);
15850
15851 for (Py_ssize_t i = 0; i < 256; i++) {
15852 Py_CLEAR(unicode_latin1[i]);
15853 }
15854 _PyUnicode_ClearStaticStrings();
15855 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015856
15857 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15858 PyMem_RawFree(interp->fs_codec.encoding);
15859 interp->fs_codec.encoding = NULL;
15860 PyMem_RawFree(interp->fs_codec.errors);
15861 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015862}
15863
15864
Georg Brandl66c221e2010-10-14 07:04:07 +000015865/* A _string module, to export formatter_parser and formatter_field_name_split
15866 to the string.Formatter class implemented in Python. */
15867
15868static PyMethodDef _string_methods[] = {
15869 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15870 METH_O, PyDoc_STR("split the argument as a field name")},
15871 {"formatter_parser", (PyCFunction) formatter_parser,
15872 METH_O, PyDoc_STR("parse the argument as a format string")},
15873 {NULL, NULL}
15874};
15875
15876static struct PyModuleDef _string_module = {
15877 PyModuleDef_HEAD_INIT,
15878 "_string",
15879 PyDoc_STR("string helper module"),
15880 0,
15881 _string_methods,
15882 NULL,
15883 NULL,
15884 NULL,
15885 NULL
15886};
15887
15888PyMODINIT_FUNC
15889PyInit__string(void)
15890{
15891 return PyModule_Create(&_string_module);
15892}
15893
15894
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015895#ifdef __cplusplus
15896}
15897#endif