blob: 9eef05a6216766d8db1f025442fdc4b821220c05 [file] [log] [blame]
/*

Unicode implementation based on original code by Fredrik Lundh,
modified by Marc-Andre Lemburg <mal@lemburg.com>.

Major speed upgrades to the method implementations at the Reykjavik
NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.

Copyright (c) Corporation for National Research Initiatives.

--------------------------------------------------------------------
The original string type implementation is:

  Copyright (c) 1999 by Secret Labs AB
  Copyright (c) 1999 by Fredrik Lundh

By obtaining, using, and/or copying this software and/or its
associated documentation, you agree that you have read, understood,
and will comply with the following terms and conditions:

Permission to use, copy, modify, and distribute this software and its
associated documentation for any purpose and without fee is hereby
granted, provided that the above copyright notice appears in all
copies, and that both that copyright notice and this permission notice
appear in supporting documentation, and that the name of Secret Labs
AB or the author not be used in advertising or publicity pertaining to
distribution of the software without specific, written prior
permission.

SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
--------------------------------------------------------------------

*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Victor Stinner8faf8212011-12-08 22:14:11 +010093/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
94#define MAX_UNICODE 0x10ffff
95
/* Type check used by the assertions in this file: debug builds run
   _PyUnicode_CheckConsistency() (without the content scan), other builds
   only perform PyUnicode_Check(). */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Field accessors.  The underscore-prefixed forms cast and read the field
   directly; the forms that assert first check the object. */

/* `utf8` field of the PyCompactUnicodeObject layout (no checks). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 pointer: for compact ASCII objects this is the memory that
   immediately follows the PyASCIIObject header, otherwise the `utf8`
   field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((char*)((PyASCIIObject*)(op) + 1))          \
         : _PyUnicode_UTF8(op))

/* `utf8_length` field of the PyCompactUnicodeObject layout (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length: for compact ASCII objects this is the `length` field,
   otherwise the `utf8_length` field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op)                     \
         ? ((PyASCIIObject*)(op))->length               \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked accessors for the remaining header fields. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* `state.kind` and `length`, guarded by a _PyUnicode_CHECK() assertion. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* `data.any` pointer of the PyUnicodeObject layout (no checks). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY(): asserts the object passes
   _PyUnicode_CHECK(), then evaluates to 0 when it is already ready and to
   the result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True when the `utf8` pointer is the same address as the character data.
   Must not be used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),               \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True when the `wstr` pointer is the same address as the character data.
   Both sides of the comparison use the macro argument `op`; the macro must
   not depend on a variable named `unicode` existing at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                           \
    (assert(_PyUnicode_CHECK(op)),                          \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object owns a separately allocated UTF-8 buffer:
   it is not compact ASCII, its `utf8` pointer is non-NULL, and that
   pointer is not the character data itself. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op) &&                \
      _PyUnicode_UTF8(op) &&                            \
      _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
158
/* True if the Unicode object owns a separately allocated wstr buffer:
   `wstr` is non-NULL and either the object is not ready yet or `wstr`
   is not the character data itself. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op)                               \
      && (!PyUnicode_IS_READY(op)                       \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   The source is only read, so it is accessed through pointers to const
   and the casts keep that qualifier.  Every temporary carries a leading
   underscore so that it does not shadow a caller variable such as `n`.
   The main loop copies four items per iteration; the trailing loop copies
   the remaining 0-3 items. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Platform-specific overallocation factor: 2 on Windows (described as
   overallocating by 50%), 4 elsewhere (described as overallocating by
   25% on Linux). */
#if defined(MS_WINDOWS)
#  define OVERALLOCATE_FACTOR 2
#else
#  define OVERALLOCATE_FACTOR 4
#endif
197
Walter Dörwald16807132007-05-25 13:52:07 +0000198/* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000205*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string, creating it first if
   it does not exist yet.  If the creation fails, unicode_empty stays NULL
   and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
/* Return the shared empty string from the enclosing function, after
   _Py_INCREF_UNICODE_EMPTY() has taken a reference to it.  The returned
   value is whatever unicode_empty holds at that point. */
#define _Py_RETURN_UNICODE_EMPTY()      \
    do {                                \
        _Py_INCREF_UNICODE_EMPTY();     \
        return unicode_empty;           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Victor Stinner709d23d2019-05-02 14:56:30 -0400268static PyObject *
269unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270 const char *errors);
271static PyObject *
272unicode_decode_utf8(const char *s, Py_ssize_t size,
273 _Py_error_handler error_handler, const char *errors,
274 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200276/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200277static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279/* Single character Unicode strings in the Latin-1 range are being
280 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282
/* Fast detection of the most frequent whitespace characters: a 128-entry
   table indexed by code point; listed entries are 1, all others are 0. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
313
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200314/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200315static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100317static int unicode_modifiable(PyObject *unicode);
318
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319
Alexander Belopolsky40018472011-02-26 01:02:56 +0000320static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100321_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200322static PyObject *
323_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324static PyObject *
325_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000328unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000329 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100330 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
Alexander Belopolsky40018472011-02-26 01:02:56 +0000333static void
334raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300335 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100336 PyObject *unicode,
337 Py_ssize_t startpos, Py_ssize_t endpos,
338 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000339
/* Fast detection of line break characters: a 128-entry table indexed by
   code point; listed entries are 1, all others are 0. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
367
INADA Naoki3ae20562017-01-16 20:41:20 +0900368static int convert_uc(PyObject *obj, void *addr);
369
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300370#include "clinic/unicodeobject.c.h"
371
Victor Stinner3d4226a2018-08-29 22:21:32 +0200372_Py_error_handler
373_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200374{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200376 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 }
378 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200385 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_OTHER;
397}
398
Victor Stinner709d23d2019-05-02 14:56:30 -0400399
400static _Py_error_handler
401get_error_handler_wide(const wchar_t *errors)
402{
403 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (wcscmp(errors, L"surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (wcscmp(errors, L"replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (wcscmp(errors, L"ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (wcscmp(errors, L"backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (wcscmp(errors, L"surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425}
426
427
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300428/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000430Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000431PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000433#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000434 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000435#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 /* This is actually an illegal character, so it should
437 not be passed to unichr. */
438 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000439#endif
440}
441
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200442int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100443_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200444{
445 PyASCIIObject *ascii;
446 unsigned int kind;
447
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200448 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200449
450 ascii = (PyASCIIObject *)op;
451 kind = ascii->state.kind;
452
Victor Stinnera3b334d2011-10-03 13:53:37 +0200453 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200454 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
455 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200456 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200457 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200458 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Miss Islington (bot)d5ba8bb2019-08-27 10:13:52 -0700459#ifndef NDEBUG
Victor Stinner7f11ad42011-10-04 00:00:20 +0200460 void *data;
Miss Islington (bot)d5ba8bb2019-08-27 10:13:52 -0700461#endif
Victor Stinner910337b2011-10-03 03:20:16 +0200462
Victor Stinnera41463c2011-10-04 01:05:08 +0200463 if (ascii->state.compact == 1) {
Miss Islington (bot)d5ba8bb2019-08-27 10:13:52 -0700464#ifndef NDEBUG
Victor Stinnera41463c2011-10-04 01:05:08 +0200465 data = compact + 1;
Miss Islington (bot)d5ba8bb2019-08-27 10:13:52 -0700466#endif
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200467 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
468 || kind == PyUnicode_2BYTE_KIND
469 || kind == PyUnicode_4BYTE_KIND);
470 _PyObject_ASSERT(op, ascii->state.ascii == 0);
471 _PyObject_ASSERT(op, ascii->state.ready == 1);
472 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100473 }
474 else {
Miss Islington (bot)d5ba8bb2019-08-27 10:13:52 -0700475#ifndef NDEBUG
Victor Stinnera41463c2011-10-04 01:05:08 +0200476 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
477
478 data = unicode->data.any;
Miss Islington (bot)d5ba8bb2019-08-27 10:13:52 -0700479#endif
Victor Stinnera41463c2011-10-04 01:05:08 +0200480 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200481 _PyObject_ASSERT(op, ascii->length == 0);
482 _PyObject_ASSERT(op, ascii->hash == -1);
483 _PyObject_ASSERT(op, ascii->state.compact == 0);
484 _PyObject_ASSERT(op, ascii->state.ascii == 0);
485 _PyObject_ASSERT(op, ascii->state.ready == 0);
486 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
487 _PyObject_ASSERT(op, ascii->wstr != NULL);
488 _PyObject_ASSERT(op, data == NULL);
489 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200490 }
491 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200492 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
493 || kind == PyUnicode_2BYTE_KIND
494 || kind == PyUnicode_4BYTE_KIND);
495 _PyObject_ASSERT(op, ascii->state.compact == 0);
496 _PyObject_ASSERT(op, ascii->state.ready == 1);
497 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200498 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200499 _PyObject_ASSERT(op, compact->utf8 == data);
500 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200501 }
502 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200503 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200504 }
505 }
506 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200507 if (
508#if SIZEOF_WCHAR_T == 2
509 kind == PyUnicode_2BYTE_KIND
510#else
511 kind == PyUnicode_4BYTE_KIND
512#endif
513 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200514 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200515 _PyObject_ASSERT(op, ascii->wstr == data);
516 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200518 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200519 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200520
521 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200522 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200523 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200524 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200525 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200526
527 /* check that the best kind is used: O(n) operation */
528 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200529 Py_ssize_t i;
530 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200531 void *data;
532 Py_UCS4 ch;
533
534 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200535 for (i=0; i < ascii->length; i++)
536 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200537 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200538 if (ch > maxchar)
539 maxchar = ch;
540 }
541 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100542 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200543 _PyObject_ASSERT(op, maxchar >= 128);
544 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100545 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200546 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200547 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200548 }
Victor Stinner77faf692011-11-20 18:56:05 +0100549 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200550 _PyObject_ASSERT(op, maxchar >= 0x100);
551 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100552 }
553 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200554 _PyObject_ASSERT(op, maxchar >= 0x10000);
555 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100556 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200557 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200558 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400559 return 1;
560}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200561
Victor Stinner910337b2011-10-03 03:20:16 +0200562
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100563static PyObject*
564unicode_result_wchar(PyObject *unicode)
565{
566#ifndef Py_DEBUG
567 Py_ssize_t len;
568
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100569 len = _PyUnicode_WSTR_LENGTH(unicode);
570 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100571 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200572 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100573 }
574
575 if (len == 1) {
576 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100577 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100578 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
579 Py_DECREF(unicode);
580 return latin1_char;
581 }
582 }
583
584 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200585 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100586 return NULL;
587 }
588#else
Victor Stinneraa771272012-10-04 02:32:58 +0200589 assert(Py_REFCNT(unicode) == 1);
590
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100591 /* don't make the result ready in debug mode to ensure that the caller
592 makes the string ready before using it */
593 assert(_PyUnicode_CheckConsistency(unicode, 1));
594#endif
595 return unicode;
596}
597
598static PyObject*
599unicode_result_ready(PyObject *unicode)
600{
601 Py_ssize_t length;
602
603 length = PyUnicode_GET_LENGTH(unicode);
604 if (length == 0) {
605 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100606 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200607 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100608 }
609 return unicode_empty;
610 }
611
612 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200613 void *data = PyUnicode_DATA(unicode);
614 int kind = PyUnicode_KIND(unicode);
615 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100616 if (ch < 256) {
617 PyObject *latin1_char = unicode_latin1[ch];
618 if (latin1_char != NULL) {
619 if (unicode != latin1_char) {
620 Py_INCREF(latin1_char);
621 Py_DECREF(unicode);
622 }
623 return latin1_char;
624 }
625 else {
626 assert(_PyUnicode_CheckConsistency(unicode, 1));
627 Py_INCREF(unicode);
628 unicode_latin1[ch] = unicode;
629 return unicode;
630 }
631 }
632 }
633
634 assert(_PyUnicode_CheckConsistency(unicode, 1));
635 return unicode;
636}
637
638static PyObject*
639unicode_result(PyObject *unicode)
640{
641 assert(_PyUnicode_CHECK(unicode));
642 if (PyUnicode_IS_READY(unicode))
643 return unicode_result_ready(unicode);
644 else
645 return unicode_result_wchar(unicode);
646}
647
Victor Stinnerc4b49542011-12-11 22:44:26 +0100648static PyObject*
649unicode_result_unchanged(PyObject *unicode)
650{
651 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500652 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100653 return NULL;
654 Py_INCREF(unicode);
655 return unicode;
656 }
657 else
658 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100659 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100660}
661
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200662/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
663 ASCII, Latin1, UTF-8, etc. */
664static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200665backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200666 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
667{
Victor Stinnerad771582015-10-09 12:38:53 +0200668 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200669 Py_UCS4 ch;
670 enum PyUnicode_Kind kind;
671 void *data;
672
673 assert(PyUnicode_IS_READY(unicode));
674 kind = PyUnicode_KIND(unicode);
675 data = PyUnicode_DATA(unicode);
676
677 size = 0;
678 /* determine replacement size */
679 for (i = collstart; i < collend; ++i) {
680 Py_ssize_t incr;
681
682 ch = PyUnicode_READ(kind, data, i);
683 if (ch < 0x100)
684 incr = 2+2;
685 else if (ch < 0x10000)
686 incr = 2+4;
687 else {
688 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200689 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200690 }
691 if (size > PY_SSIZE_T_MAX - incr) {
692 PyErr_SetString(PyExc_OverflowError,
693 "encoded result is too long for a Python string");
694 return NULL;
695 }
696 size += incr;
697 }
698
Victor Stinnerad771582015-10-09 12:38:53 +0200699 str = _PyBytesWriter_Prepare(writer, str, size);
700 if (str == NULL)
701 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200702
703 /* generate replacement */
704 for (i = collstart; i < collend; ++i) {
705 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200706 *str++ = '\\';
707 if (ch >= 0x00010000) {
708 *str++ = 'U';
709 *str++ = Py_hexdigits[(ch>>28)&0xf];
710 *str++ = Py_hexdigits[(ch>>24)&0xf];
711 *str++ = Py_hexdigits[(ch>>20)&0xf];
712 *str++ = Py_hexdigits[(ch>>16)&0xf];
713 *str++ = Py_hexdigits[(ch>>12)&0xf];
714 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200715 }
Victor Stinner797485e2015-10-09 03:17:30 +0200716 else if (ch >= 0x100) {
717 *str++ = 'u';
718 *str++ = Py_hexdigits[(ch>>12)&0xf];
719 *str++ = Py_hexdigits[(ch>>8)&0xf];
720 }
721 else
722 *str++ = 'x';
723 *str++ = Py_hexdigits[(ch>>4)&0xf];
724 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200725 }
726 return str;
727}
728
729/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
730 ASCII, Latin1, UTF-8, etc. */
731static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200732xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200733 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
734{
Victor Stinnerad771582015-10-09 12:38:53 +0200735 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200736 Py_UCS4 ch;
737 enum PyUnicode_Kind kind;
738 void *data;
739
740 assert(PyUnicode_IS_READY(unicode));
741 kind = PyUnicode_KIND(unicode);
742 data = PyUnicode_DATA(unicode);
743
744 size = 0;
745 /* determine replacement size */
746 for (i = collstart; i < collend; ++i) {
747 Py_ssize_t incr;
748
749 ch = PyUnicode_READ(kind, data, i);
750 if (ch < 10)
751 incr = 2+1+1;
752 else if (ch < 100)
753 incr = 2+2+1;
754 else if (ch < 1000)
755 incr = 2+3+1;
756 else if (ch < 10000)
757 incr = 2+4+1;
758 else if (ch < 100000)
759 incr = 2+5+1;
760 else if (ch < 1000000)
761 incr = 2+6+1;
762 else {
763 assert(ch <= MAX_UNICODE);
764 incr = 2+7+1;
765 }
766 if (size > PY_SSIZE_T_MAX - incr) {
767 PyErr_SetString(PyExc_OverflowError,
768 "encoded result is too long for a Python string");
769 return NULL;
770 }
771 size += incr;
772 }
773
Victor Stinnerad771582015-10-09 12:38:53 +0200774 str = _PyBytesWriter_Prepare(writer, str, size);
775 if (str == NULL)
776 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777
778 /* generate replacement */
779 for (i = collstart; i < collend; ++i) {
780 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
781 }
782 return str;
783}
784
Thomas Wouters477c8d52006-05-27 19:21:47 +0000785/* --- Bloom Filters ----------------------------------------------------- */
786
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on LONG_BIT) of each
   Unicode character as the bit index. */
790
791/* the linebreak mask is set up by Unicode_Init below */
792
/* Number of bits used in a bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Type holding a bloom mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so that BLOOM() accepts any character for
   this mask until a real mask is stored. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of `ch` is set in `mask`.
   Both arguments are parenthesized so that expressions such as
   BLOOM(a | b, ch) are evaluated as a whole. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up in ascii_linebreak[];
   for the others the bloom mask is checked first and
   Py_UNICODE_ISLINEBREAK() is only evaluated when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000812
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700813static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200814make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000815{
Victor Stinnera85af502013-04-09 21:53:54 +0200816#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
817 do { \
818 TYPE *data = (TYPE *)PTR; \
819 TYPE *end = data + LEN; \
820 Py_UCS4 ch; \
821 for (; data != end; data++) { \
822 ch = *data; \
823 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
824 } \
825 break; \
826 } while (0)
827
Thomas Wouters477c8d52006-05-27 19:21:47 +0000828 /* calculate simple bloom-style bitmask for a given unicode string */
829
Antoine Pitrouf068f942010-01-13 14:19:12 +0000830 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000831
832 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200833 switch (kind) {
834 case PyUnicode_1BYTE_KIND:
835 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
836 break;
837 case PyUnicode_2BYTE_KIND:
838 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
839 break;
840 case PyUnicode_4BYTE_KIND:
841 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
842 break;
843 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700844 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200845 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000846 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200847
848#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000849}
850
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300851static int
852ensure_unicode(PyObject *obj)
853{
854 if (!PyUnicode_Check(obj)) {
855 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200856 "must be str, not %.100s",
857 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300858 return -1;
859 }
860 return PyUnicode_READY(obj);
861}
862
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200863/* Compilation of templated routines */
864
865#include "stringlib/asciilib.h"
866#include "stringlib/fastsearch.h"
867#include "stringlib/partition.h"
868#include "stringlib/split.h"
869#include "stringlib/count.h"
870#include "stringlib/find.h"
871#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200872#include "stringlib/undef.h"
873
874#include "stringlib/ucs1lib.h"
875#include "stringlib/fastsearch.h"
876#include "stringlib/partition.h"
877#include "stringlib/split.h"
878#include "stringlib/count.h"
879#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300880#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200881#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200882#include "stringlib/undef.h"
883
884#include "stringlib/ucs2lib.h"
885#include "stringlib/fastsearch.h"
886#include "stringlib/partition.h"
887#include "stringlib/split.h"
888#include "stringlib/count.h"
889#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300890#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200891#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200892#include "stringlib/undef.h"
893
894#include "stringlib/ucs4lib.h"
895#include "stringlib/fastsearch.h"
896#include "stringlib/partition.h"
897#include "stringlib/split.h"
898#include "stringlib/count.h"
899#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300900#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200901#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200902#include "stringlib/undef.h"
903
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200904#include "stringlib/unicodedefs.h"
905#include "stringlib/fastsearch.h"
906#include "stringlib/count.h"
907#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100908#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200909
Guido van Rossumd57fd912000-03-10 22:53:23 +0000910/* --- Unicode Object ----------------------------------------------------- */
911
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700912static inline Py_ssize_t
913findchar(const void *s, int kind,
914 Py_ssize_t size, Py_UCS4 ch,
915 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200916{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200917 switch (kind) {
918 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200919 if ((Py_UCS1) ch != ch)
920 return -1;
921 if (direction > 0)
922 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
923 else
924 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200925 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200926 if ((Py_UCS2) ch != ch)
927 return -1;
928 if (direction > 0)
929 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
930 else
931 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200932 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200933 if (direction > 0)
934 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
935 else
936 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200937 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700938 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200939 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200940}
941
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters between *old_length* and the current length are
   overwritten (every byte is set to 0xff); nothing is done when the string
   did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int unit = PyUnicode_KIND(unicode);     /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        Py_UCS1 *first_new = base + old_length * unit;
        memset(first_new, 0xff, (new_length - old_length) * unit);
    }
}
#endif
960
Victor Stinnerfe226c02011-10-03 03:52:20 +0200961static PyObject*
962resize_compact(PyObject *unicode, Py_ssize_t length)
963{
964 Py_ssize_t char_size;
965 Py_ssize_t struct_size;
966 Py_ssize_t new_size;
967 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100968 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200969#ifdef Py_DEBUG
970 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
971#endif
972
Victor Stinner79891572012-05-03 13:43:07 +0200973 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200974 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100975 assert(PyUnicode_IS_COMPACT(unicode));
976
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200977 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100978 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200979 struct_size = sizeof(PyASCIIObject);
980 else
981 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200982 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200983
Victor Stinnerfe226c02011-10-03 03:52:20 +0200984 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
985 PyErr_NoMemory();
986 return NULL;
987 }
988 new_size = (struct_size + (length + 1) * char_size);
989
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200990 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
991 PyObject_DEL(_PyUnicode_UTF8(unicode));
992 _PyUnicode_UTF8(unicode) = NULL;
993 _PyUnicode_UTF8_LENGTH(unicode) = 0;
994 }
Victor Stinner84def372011-12-11 20:04:56 +0100995 _Py_DEC_REFTOTAL;
996 _Py_ForgetReference(unicode);
997
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300998 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100999 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001000 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 PyErr_NoMemory();
1002 return NULL;
1003 }
Victor Stinner84def372011-12-11 20:04:56 +01001004 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001005 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001006
Victor Stinnerfe226c02011-10-03 03:52:20 +02001007 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001008 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001009 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001010 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001011 _PyUnicode_WSTR_LENGTH(unicode) = length;
1012 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001013 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1014 PyObject_DEL(_PyUnicode_WSTR(unicode));
1015 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001016 if (!PyUnicode_IS_ASCII(unicode))
1017 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001018 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001019#ifdef Py_DEBUG
1020 unicode_fill_invalid(unicode, old_length);
1021#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001022 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1023 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001024 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001025 return unicode;
1026}
1027
Alexander Belopolsky40018472011-02-26 01:02:56 +00001028static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001029resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001030{
Victor Stinner95663112011-10-04 01:03:50 +02001031 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001032 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001033 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001034 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001035
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 if (PyUnicode_IS_READY(unicode)) {
1037 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001038 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001039 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001040#ifdef Py_DEBUG
1041 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1042#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001043
1044 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001045 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001046 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1047 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001048
1049 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1050 PyErr_NoMemory();
1051 return -1;
1052 }
1053 new_size = (length + 1) * char_size;
1054
Victor Stinner7a9105a2011-12-12 00:13:42 +01001055 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1056 {
1057 PyObject_DEL(_PyUnicode_UTF8(unicode));
1058 _PyUnicode_UTF8(unicode) = NULL;
1059 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1060 }
1061
Victor Stinnerfe226c02011-10-03 03:52:20 +02001062 data = (PyObject *)PyObject_REALLOC(data, new_size);
1063 if (data == NULL) {
1064 PyErr_NoMemory();
1065 return -1;
1066 }
1067 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001069 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001070 _PyUnicode_WSTR_LENGTH(unicode) = length;
1071 }
1072 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001073 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001074 _PyUnicode_UTF8_LENGTH(unicode) = length;
1075 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001076 _PyUnicode_LENGTH(unicode) = length;
1077 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001078#ifdef Py_DEBUG
1079 unicode_fill_invalid(unicode, old_length);
1080#endif
Victor Stinner95663112011-10-04 01:03:50 +02001081 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001082 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001083 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001084 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001085 }
Victor Stinner95663112011-10-04 01:03:50 +02001086 assert(_PyUnicode_WSTR(unicode) != NULL);
1087
1088 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001089 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001090 PyErr_NoMemory();
1091 return -1;
1092 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001093 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001094 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001095 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001096 if (!wstr) {
1097 PyErr_NoMemory();
1098 return -1;
1099 }
1100 _PyUnicode_WSTR(unicode) = wstr;
1101 _PyUnicode_WSTR(unicode)[length] = 0;
1102 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001103 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001104 return 0;
1105}
1106
Victor Stinnerfe226c02011-10-03 03:52:20 +02001107static PyObject*
1108resize_copy(PyObject *unicode, Py_ssize_t length)
1109{
1110 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001111 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001112 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001113
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001114 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001115
1116 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1117 if (copy == NULL)
1118 return NULL;
1119
1120 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001121 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001123 }
1124 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001125 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001126
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001127 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001128 if (w == NULL)
1129 return NULL;
1130 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1131 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001132 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001133 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001134 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001135 }
1136}
1137
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001139 Ux0000 terminated; some code (e.g. new_identifier)
1140 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001141
1142 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001143 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001144
1145*/
1146
Alexander Belopolsky40018472011-02-26 01:02:56 +00001147static PyUnicodeObject *
1148_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001149{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001150 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001151 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001152
Thomas Wouters477c8d52006-05-27 19:21:47 +00001153 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001154 if (length == 0 && unicode_empty != NULL) {
1155 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001156 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001157 }
1158
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001159 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001160 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001161 return (PyUnicodeObject *)PyErr_NoMemory();
1162 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 if (length < 0) {
1164 PyErr_SetString(PyExc_SystemError,
1165 "Negative size passed to _PyUnicode_New");
1166 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001167 }
1168
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001169 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1170 if (unicode == NULL)
1171 return NULL;
1172 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001173
1174 _PyUnicode_WSTR_LENGTH(unicode) = length;
1175 _PyUnicode_HASH(unicode) = -1;
1176 _PyUnicode_STATE(unicode).interned = 0;
1177 _PyUnicode_STATE(unicode).kind = 0;
1178 _PyUnicode_STATE(unicode).compact = 0;
1179 _PyUnicode_STATE(unicode).ready = 0;
1180 _PyUnicode_STATE(unicode).ascii = 0;
1181 _PyUnicode_DATA_ANY(unicode) = NULL;
1182 _PyUnicode_LENGTH(unicode) = 0;
1183 _PyUnicode_UTF8(unicode) = NULL;
1184 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1185
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1187 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001188 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001189 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001190 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001192
Jeremy Hyltond8082792003-09-16 19:41:39 +00001193 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001194 * the caller fails before initializing str -- unicode_resize()
1195 * reads str[0], and the Keep-Alive optimization can keep memory
1196 * allocated for str alive across a call to unicode_dealloc(unicode).
1197 * We don't want unicode_resize to read uninitialized memory in
1198 * that case.
1199 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001200 _PyUnicode_WSTR(unicode)[0] = 0;
1201 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001202
Victor Stinner7931d9a2011-11-04 00:22:48 +01001203 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001204 return unicode;
1205}
1206
Victor Stinnerf42dc442011-10-02 23:33:16 +02001207static const char*
1208unicode_kind_name(PyObject *unicode)
1209{
Victor Stinner42dfd712011-10-03 14:41:45 +02001210 /* don't check consistency: unicode_kind_name() is called from
1211 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001212 if (!PyUnicode_IS_COMPACT(unicode))
1213 {
1214 if (!PyUnicode_IS_READY(unicode))
1215 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001216 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001217 {
1218 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001219 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001220 return "legacy ascii";
1221 else
1222 return "legacy latin1";
1223 case PyUnicode_2BYTE_KIND:
1224 return "legacy UCS2";
1225 case PyUnicode_4BYTE_KIND:
1226 return "legacy UCS4";
1227 default:
1228 return "<legacy invalid kind>";
1229 }
1230 }
1231 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001232 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001233 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001234 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001235 return "ascii";
1236 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001237 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001238 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001239 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001240 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001241 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001242 default:
1243 return "<invalid compact kind>";
1244 }
1245}
1246
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001247#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001249char *_PyUnicode_utf8(void *unicode_raw){
1250 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001251 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001252}
1253
Victor Stinnera42de742018-11-22 10:25:22 +01001254void *_PyUnicode_compact_data(void *unicode_raw) {
1255 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 return _PyUnicode_COMPACT_DATA(unicode);
1257}
Victor Stinnera42de742018-11-22 10:25:22 +01001258void *_PyUnicode_data(void *unicode_raw) {
1259 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001260 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001261 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1262 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1263 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1264 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1265 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1266 return PyUnicode_DATA(unicode);
1267}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001268
1269void
1270_PyUnicode_Dump(PyObject *op)
1271{
1272 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001273 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1274 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1275 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001276
Victor Stinnera849a4b2011-10-03 12:12:11 +02001277 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001278 {
1279 if (ascii->state.ascii)
1280 data = (ascii + 1);
1281 else
1282 data = (compact + 1);
1283 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001284 else
1285 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001286 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1287 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001288
Victor Stinnera849a4b2011-10-03 12:12:11 +02001289 if (ascii->wstr == data)
1290 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001291 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001292
Victor Stinnera3b334d2011-10-03 13:53:37 +02001293 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001294 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001295 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1296 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001297 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001298 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001299 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001300 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001301}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001302#endif
1303
1304PyObject *
1305PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1306{
1307 PyObject *obj;
1308 PyCompactUnicodeObject *unicode;
1309 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001310 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001311 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 Py_ssize_t char_size;
1313 Py_ssize_t struct_size;
1314
1315 /* Optimization for empty strings */
1316 if (size == 0 && unicode_empty != NULL) {
1317 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001318 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001319 }
1320
Victor Stinner9e9d6892011-10-04 01:02:02 +02001321 is_ascii = 0;
1322 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001323 struct_size = sizeof(PyCompactUnicodeObject);
1324 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001325 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 char_size = 1;
1327 is_ascii = 1;
1328 struct_size = sizeof(PyASCIIObject);
1329 }
1330 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001331 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001332 char_size = 1;
1333 }
1334 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001335 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001336 char_size = 2;
1337 if (sizeof(wchar_t) == 2)
1338 is_sharing = 1;
1339 }
1340 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001341 if (maxchar > MAX_UNICODE) {
1342 PyErr_SetString(PyExc_SystemError,
1343 "invalid maximum character passed to PyUnicode_New");
1344 return NULL;
1345 }
Victor Stinner8f825062012-04-27 13:55:39 +02001346 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001347 char_size = 4;
1348 if (sizeof(wchar_t) == 4)
1349 is_sharing = 1;
1350 }
1351
1352 /* Ensure we won't overflow the size. */
1353 if (size < 0) {
1354 PyErr_SetString(PyExc_SystemError,
1355 "Negative size passed to PyUnicode_New");
1356 return NULL;
1357 }
1358 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1359 return PyErr_NoMemory();
1360
1361 /* Duplicated allocation code from _PyObject_New() instead of a call to
1362 * PyObject_New() so we are able to allocate space for the object and
1363 * it's data buffer.
1364 */
1365 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1366 if (obj == NULL)
1367 return PyErr_NoMemory();
1368 obj = PyObject_INIT(obj, &PyUnicode_Type);
1369 if (obj == NULL)
1370 return NULL;
1371
1372 unicode = (PyCompactUnicodeObject *)obj;
1373 if (is_ascii)
1374 data = ((PyASCIIObject*)obj) + 1;
1375 else
1376 data = unicode + 1;
1377 _PyUnicode_LENGTH(unicode) = size;
1378 _PyUnicode_HASH(unicode) = -1;
1379 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001380 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001381 _PyUnicode_STATE(unicode).compact = 1;
1382 _PyUnicode_STATE(unicode).ready = 1;
1383 _PyUnicode_STATE(unicode).ascii = is_ascii;
1384 if (is_ascii) {
1385 ((char*)data)[size] = 0;
1386 _PyUnicode_WSTR(unicode) = NULL;
1387 }
Victor Stinner8f825062012-04-27 13:55:39 +02001388 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 ((char*)data)[size] = 0;
1390 _PyUnicode_WSTR(unicode) = NULL;
1391 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001393 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 else {
1396 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001397 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001398 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001399 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001400 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001401 ((Py_UCS4*)data)[size] = 0;
1402 if (is_sharing) {
1403 _PyUnicode_WSTR_LENGTH(unicode) = size;
1404 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1405 }
1406 else {
1407 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1408 _PyUnicode_WSTR(unicode) = NULL;
1409 }
1410 }
Victor Stinner8f825062012-04-27 13:55:39 +02001411#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001412 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001413#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001414 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001415 return obj;
1416}
1417
1418#if SIZEOF_WCHAR_T == 2
1419/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1420 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001421 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001422
1423 This function assumes that unicode can hold one more code point than wstr
1424 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001425static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001426unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001427 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001428{
1429 const wchar_t *iter;
1430 Py_UCS4 *ucs4_out;
1431
Victor Stinner910337b2011-10-03 03:20:16 +02001432 assert(unicode != NULL);
1433 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001434 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1435 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1436
1437 for (iter = begin; iter < end; ) {
1438 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1439 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001440 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1441 && (iter+1) < end
1442 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001443 {
Victor Stinner551ac952011-11-29 22:58:13 +01001444 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 iter += 2;
1446 }
1447 else {
1448 *ucs4_out++ = *iter;
1449 iter++;
1450 }
1451 }
1452 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1453 _PyUnicode_GET_LENGTH(unicode)));
1454
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455}
1456#endif
1457
Victor Stinnercd9950f2011-10-02 00:34:53 +02001458static int
Victor Stinner488fa492011-12-12 00:01:39 +01001459unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460{
Victor Stinner488fa492011-12-12 00:01:39 +01001461 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001462 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001463 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001464 return -1;
1465 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001466 return 0;
1467}
1468
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001469static int
1470_copy_characters(PyObject *to, Py_ssize_t to_start,
1471 PyObject *from, Py_ssize_t from_start,
1472 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001473{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001474 unsigned int from_kind, to_kind;
1475 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001476
Victor Stinneree4544c2012-05-09 22:24:08 +02001477 assert(0 <= how_many);
1478 assert(0 <= from_start);
1479 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001480 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001481 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001482 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001483
Victor Stinnerd3f08822012-05-29 12:57:52 +02001484 assert(PyUnicode_Check(to));
1485 assert(PyUnicode_IS_READY(to));
1486 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1487
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001488 if (how_many == 0)
1489 return 0;
1490
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001491 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001492 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001493 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001494 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001495
Victor Stinnerf1852262012-06-16 16:38:26 +02001496#ifdef Py_DEBUG
1497 if (!check_maxchar
1498 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1499 {
1500 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1501 Py_UCS4 ch;
1502 Py_ssize_t i;
1503 for (i=0; i < how_many; i++) {
1504 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1505 assert(ch <= to_maxchar);
1506 }
1507 }
1508#endif
1509
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001510 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001511 if (check_maxchar
1512 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1513 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001514 /* Writing Latin-1 characters into an ASCII string requires to
1515 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001516 Py_UCS4 max_char;
1517 max_char = ucs1lib_find_max_char(from_data,
1518 (Py_UCS1*)from_data + how_many);
1519 if (max_char >= 128)
1520 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001521 }
Christian Heimesf051e432016-09-13 20:22:02 +02001522 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001523 (char*)from_data + from_kind * from_start,
1524 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001525 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001526 else if (from_kind == PyUnicode_1BYTE_KIND
1527 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001528 {
1529 _PyUnicode_CONVERT_BYTES(
1530 Py_UCS1, Py_UCS2,
1531 PyUnicode_1BYTE_DATA(from) + from_start,
1532 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1533 PyUnicode_2BYTE_DATA(to) + to_start
1534 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001535 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001536 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001537 && to_kind == PyUnicode_4BYTE_KIND)
1538 {
1539 _PyUnicode_CONVERT_BYTES(
1540 Py_UCS1, Py_UCS4,
1541 PyUnicode_1BYTE_DATA(from) + from_start,
1542 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1543 PyUnicode_4BYTE_DATA(to) + to_start
1544 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001545 }
1546 else if (from_kind == PyUnicode_2BYTE_KIND
1547 && to_kind == PyUnicode_4BYTE_KIND)
1548 {
1549 _PyUnicode_CONVERT_BYTES(
1550 Py_UCS2, Py_UCS4,
1551 PyUnicode_2BYTE_DATA(from) + from_start,
1552 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1553 PyUnicode_4BYTE_DATA(to) + to_start
1554 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001555 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001556 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001557 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1558
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001559 if (!check_maxchar) {
1560 if (from_kind == PyUnicode_2BYTE_KIND
1561 && to_kind == PyUnicode_1BYTE_KIND)
1562 {
1563 _PyUnicode_CONVERT_BYTES(
1564 Py_UCS2, Py_UCS1,
1565 PyUnicode_2BYTE_DATA(from) + from_start,
1566 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1567 PyUnicode_1BYTE_DATA(to) + to_start
1568 );
1569 }
1570 else if (from_kind == PyUnicode_4BYTE_KIND
1571 && to_kind == PyUnicode_1BYTE_KIND)
1572 {
1573 _PyUnicode_CONVERT_BYTES(
1574 Py_UCS4, Py_UCS1,
1575 PyUnicode_4BYTE_DATA(from) + from_start,
1576 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1577 PyUnicode_1BYTE_DATA(to) + to_start
1578 );
1579 }
1580 else if (from_kind == PyUnicode_4BYTE_KIND
1581 && to_kind == PyUnicode_2BYTE_KIND)
1582 {
1583 _PyUnicode_CONVERT_BYTES(
1584 Py_UCS4, Py_UCS2,
1585 PyUnicode_4BYTE_DATA(from) + from_start,
1586 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1587 PyUnicode_2BYTE_DATA(to) + to_start
1588 );
1589 }
1590 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001591 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001592 }
1593 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001594 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001595 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001596 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001597 Py_ssize_t i;
1598
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 for (i=0; i < how_many; i++) {
1600 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001601 if (ch > to_maxchar)
1602 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001603 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1604 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001605 }
1606 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001607 return 0;
1608}
1609
Victor Stinnerd3f08822012-05-29 12:57:52 +02001610void
1611_PyUnicode_FastCopyCharacters(
1612 PyObject *to, Py_ssize_t to_start,
1613 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001614{
1615 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1616}
1617
1618Py_ssize_t
1619PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1620 PyObject *from, Py_ssize_t from_start,
1621 Py_ssize_t how_many)
1622{
1623 int err;
1624
1625 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1626 PyErr_BadInternalCall();
1627 return -1;
1628 }
1629
Benjamin Petersonbac79492012-01-14 13:34:47 -05001630 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001631 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001632 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001633 return -1;
1634
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001635 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001636 PyErr_SetString(PyExc_IndexError, "string index out of range");
1637 return -1;
1638 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001639 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001640 PyErr_SetString(PyExc_IndexError, "string index out of range");
1641 return -1;
1642 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001643 if (how_many < 0) {
1644 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1645 return -1;
1646 }
1647 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001648 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1649 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001650 "Cannot write %zi characters at %zi "
1651 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652 how_many, to_start, PyUnicode_GET_LENGTH(to));
1653 return -1;
1654 }
1655
1656 if (how_many == 0)
1657 return 0;
1658
Victor Stinner488fa492011-12-12 00:01:39 +01001659 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001660 return -1;
1661
1662 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1663 if (err) {
1664 PyErr_Format(PyExc_SystemError,
1665 "Cannot copy %s characters "
1666 "into a string of %s characters",
1667 unicode_kind_name(from),
1668 unicode_kind_name(to));
1669 return -1;
1670 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001671 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001672}
1673
Victor Stinner17222162011-09-28 22:15:37 +02001674/* Find the maximum code point and count the number of surrogate pairs so a
1675 correct string length can be computed before converting a string to UCS4.
1676 This function counts single surrogates as a character and not as a pair.
1677
1678 Return 0 on success, or -1 on error. */
1679static int
1680find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1681 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001682{
1683 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001684 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685
Victor Stinnerc53be962011-10-02 21:33:54 +02001686 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001687 *num_surrogates = 0;
1688 *maxchar = 0;
1689
1690 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001691#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001692 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1693 && (iter+1) < end
1694 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1695 {
1696 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1697 ++(*num_surrogates);
1698 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001699 }
1700 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001701#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001702 {
1703 ch = *iter;
1704 iter++;
1705 }
1706 if (ch > *maxchar) {
1707 *maxchar = ch;
1708 if (*maxchar > MAX_UNICODE) {
1709 PyErr_Format(PyExc_ValueError,
1710 "character U+%x is not in range [U+0000; U+10ffff]",
1711 ch);
1712 return -1;
1713 }
1714 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715 }
1716 return 0;
1717}
1718
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001719int
1720_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001721{
1722 wchar_t *end;
1723 Py_UCS4 maxchar = 0;
1724 Py_ssize_t num_surrogates;
1725#if SIZEOF_WCHAR_T == 2
1726 Py_ssize_t length_wo_surrogates;
1727#endif
1728
Georg Brandl7597add2011-10-05 16:36:47 +02001729 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001730 strings were created using _PyObject_New() and where no canonical
1731 representation (the str field) has been set yet aka strings
1732 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001733 assert(_PyUnicode_CHECK(unicode));
1734 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001736 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001737 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001738 /* Actually, it should neither be interned nor be anything else: */
1739 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001742 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001743 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001744 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001745
1746 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001747 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1748 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001749 PyErr_NoMemory();
1750 return -1;
1751 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001752 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001753 _PyUnicode_WSTR(unicode), end,
1754 PyUnicode_1BYTE_DATA(unicode));
1755 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1756 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1757 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1758 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001759 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001760 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001761 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001762 }
1763 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001764 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001765 _PyUnicode_UTF8(unicode) = NULL;
1766 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001767 }
1768 PyObject_FREE(_PyUnicode_WSTR(unicode));
1769 _PyUnicode_WSTR(unicode) = NULL;
1770 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1771 }
1772 /* In this case we might have to convert down from 4-byte native
1773 wchar_t to 2-byte unicode. */
1774 else if (maxchar < 65536) {
1775 assert(num_surrogates == 0 &&
1776 "FindMaxCharAndNumSurrogatePairs() messed up");
1777
Victor Stinner506f5922011-09-28 22:34:18 +02001778#if SIZEOF_WCHAR_T == 2
1779 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001780 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001781 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1782 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1783 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001784 _PyUnicode_UTF8(unicode) = NULL;
1785 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001786#else
1787 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001788 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001789 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001790 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001791 PyErr_NoMemory();
1792 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001793 }
Victor Stinner506f5922011-09-28 22:34:18 +02001794 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1795 _PyUnicode_WSTR(unicode), end,
1796 PyUnicode_2BYTE_DATA(unicode));
1797 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1798 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1799 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001800 _PyUnicode_UTF8(unicode) = NULL;
1801 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001802 PyObject_FREE(_PyUnicode_WSTR(unicode));
1803 _PyUnicode_WSTR(unicode) = NULL;
1804 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1805#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001806 }
1807 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1808 else {
1809#if SIZEOF_WCHAR_T == 2
1810 /* in case the native representation is 2-bytes, we need to allocate a
1811 new normalized 4-byte version. */
1812 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001813 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1814 PyErr_NoMemory();
1815 return -1;
1816 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001817 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1818 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001819 PyErr_NoMemory();
1820 return -1;
1821 }
1822 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1823 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001824 _PyUnicode_UTF8(unicode) = NULL;
1825 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001826 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1827 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001828 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001829 PyObject_FREE(_PyUnicode_WSTR(unicode));
1830 _PyUnicode_WSTR(unicode) = NULL;
1831 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1832#else
1833 assert(num_surrogates == 0);
1834
Victor Stinnerc3c74152011-10-02 20:39:55 +02001835 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001836 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001837 _PyUnicode_UTF8(unicode) = NULL;
1838 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1840#endif
1841 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1842 }
1843 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001844 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001845 return 0;
1846}
1847
Alexander Belopolsky40018472011-02-26 01:02:56 +00001848static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001849unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001850{
Walter Dörwald16807132007-05-25 13:52:07 +00001851 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001852 case SSTATE_NOT_INTERNED:
1853 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001854
Benjamin Peterson29060642009-01-31 22:14:21 +00001855 case SSTATE_INTERNED_MORTAL:
1856 /* revive dead object temporarily for DelItem */
1857 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001858 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001859 Py_FatalError(
1860 "deletion of interned string failed");
1861 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001862
Benjamin Peterson29060642009-01-31 22:14:21 +00001863 case SSTATE_INTERNED_IMMORTAL:
1864 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001865 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001866
Benjamin Peterson29060642009-01-31 22:14:21 +00001867 default:
1868 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001869 }
1870
Victor Stinner03490912011-10-03 23:45:12 +02001871 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001872 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001873 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001874 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001875 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1876 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001877
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001878 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001879}
1880
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001881#ifdef Py_DEBUG
1882static int
1883unicode_is_singleton(PyObject *unicode)
1884{
1885 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1886 if (unicode == unicode_empty)
1887 return 1;
1888 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1889 {
1890 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1891 if (ch < 256 && unicode_latin1[ch] == unicode)
1892 return 1;
1893 }
1894 return 0;
1895}
1896#endif
1897
Alexander Belopolsky40018472011-02-26 01:02:56 +00001898static int
Victor Stinner488fa492011-12-12 00:01:39 +01001899unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001900{
Victor Stinner488fa492011-12-12 00:01:39 +01001901 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001902 if (Py_REFCNT(unicode) != 1)
1903 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001904 if (_PyUnicode_HASH(unicode) != -1)
1905 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001906 if (PyUnicode_CHECK_INTERNED(unicode))
1907 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001908 if (!PyUnicode_CheckExact(unicode))
1909 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001910#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001911 /* singleton refcount is greater than 1 */
1912 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001913#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001914 return 1;
1915}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001916
Victor Stinnerfe226c02011-10-03 03:52:20 +02001917static int
1918unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1919{
1920 PyObject *unicode;
1921 Py_ssize_t old_length;
1922
1923 assert(p_unicode != NULL);
1924 unicode = *p_unicode;
1925
1926 assert(unicode != NULL);
1927 assert(PyUnicode_Check(unicode));
1928 assert(0 <= length);
1929
Victor Stinner910337b2011-10-03 03:20:16 +02001930 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001931 old_length = PyUnicode_WSTR_LENGTH(unicode);
1932 else
1933 old_length = PyUnicode_GET_LENGTH(unicode);
1934 if (old_length == length)
1935 return 0;
1936
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001937 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001938 _Py_INCREF_UNICODE_EMPTY();
1939 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001940 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001941 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001942 return 0;
1943 }
1944
Victor Stinner488fa492011-12-12 00:01:39 +01001945 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001946 PyObject *copy = resize_copy(unicode, length);
1947 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001948 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001949 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001950 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001951 }
1952
Victor Stinnerfe226c02011-10-03 03:52:20 +02001953 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001954 PyObject *new_unicode = resize_compact(unicode, length);
1955 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001956 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001957 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001958 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001959 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001960 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001961}
1962
Alexander Belopolsky40018472011-02-26 01:02:56 +00001963int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001964PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001965{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001966 PyObject *unicode;
1967 if (p_unicode == NULL) {
1968 PyErr_BadInternalCall();
1969 return -1;
1970 }
1971 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001972 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001973 {
1974 PyErr_BadInternalCall();
1975 return -1;
1976 }
1977 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001978}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001979
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001980/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001981
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001982 WARNING: The function doesn't copy the terminating null character and
1983 doesn't check the maximum character (may write a latin1 character in an
1984 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001985static void
1986unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1987 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001988{
1989 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1990 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001991 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001992
1993 switch (kind) {
1994 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001995 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001996#ifdef Py_DEBUG
1997 if (PyUnicode_IS_ASCII(unicode)) {
1998 Py_UCS4 maxchar = ucs1lib_find_max_char(
1999 (const Py_UCS1*)str,
2000 (const Py_UCS1*)str + len);
2001 assert(maxchar < 128);
2002 }
2003#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002004 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002005 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002006 }
2007 case PyUnicode_2BYTE_KIND: {
2008 Py_UCS2 *start = (Py_UCS2 *)data + index;
2009 Py_UCS2 *ucs2 = start;
2010 assert(index <= PyUnicode_GET_LENGTH(unicode));
2011
Victor Stinner184252a2012-06-16 02:57:41 +02002012 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002013 *ucs2 = (Py_UCS2)*str;
2014
2015 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002016 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002017 }
2018 default: {
2019 Py_UCS4 *start = (Py_UCS4 *)data + index;
2020 Py_UCS4 *ucs4 = start;
2021 assert(kind == PyUnicode_4BYTE_KIND);
2022 assert(index <= PyUnicode_GET_LENGTH(unicode));
2023
Victor Stinner184252a2012-06-16 02:57:41 +02002024 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002025 *ucs4 = (Py_UCS4)*str;
2026
2027 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002028 }
2029 }
2030}
2031
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032static PyObject*
2033get_latin1_char(unsigned char ch)
2034{
Victor Stinnera464fc12011-10-02 20:39:30 +02002035 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002037 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002038 if (!unicode)
2039 return NULL;
2040 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002041 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002042 unicode_latin1[ch] = unicode;
2043 }
2044 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002045 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002046}
2047
Victor Stinner985a82a2014-01-03 12:53:47 +01002048static PyObject*
2049unicode_char(Py_UCS4 ch)
2050{
2051 PyObject *unicode;
2052
2053 assert(ch <= MAX_UNICODE);
2054
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002055 if (ch < 256)
2056 return get_latin1_char(ch);
2057
Victor Stinner985a82a2014-01-03 12:53:47 +01002058 unicode = PyUnicode_New(1, ch);
2059 if (unicode == NULL)
2060 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002061
2062 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2063 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002064 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002065 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002066 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2067 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2068 }
2069 assert(_PyUnicode_CheckConsistency(unicode, 1));
2070 return unicode;
2071}
2072
Alexander Belopolsky40018472011-02-26 01:02:56 +00002073PyObject *
2074PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002075{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002076 if (u == NULL)
2077 return (PyObject*)_PyUnicode_New(size);
2078
2079 if (size < 0) {
2080 PyErr_BadInternalCall();
2081 return NULL;
2082 }
2083
2084 return PyUnicode_FromWideChar(u, size);
2085}
2086
2087PyObject *
2088PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2089{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002090 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002091 Py_UCS4 maxchar = 0;
2092 Py_ssize_t num_surrogates;
2093
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002094 if (u == NULL && size != 0) {
2095 PyErr_BadInternalCall();
2096 return NULL;
2097 }
2098
2099 if (size == -1) {
2100 size = wcslen(u);
2101 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002102
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002103 /* If the Unicode data is known at construction time, we can apply
2104 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002105
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002107 if (size == 0)
2108 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002109
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002110 /* Single character Unicode objects in the Latin-1 range are
2111 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002112 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return get_latin1_char((unsigned char)*u);
2114
2115 /* If not empty and not single character, copy the Unicode data
2116 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002117 if (find_maxchar_surrogates(u, u + size,
2118 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 return NULL;
2120
Victor Stinner8faf8212011-12-08 22:14:11 +01002121 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002122 if (!unicode)
2123 return NULL;
2124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002125 switch (PyUnicode_KIND(unicode)) {
2126 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002127 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002128 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2129 break;
2130 case PyUnicode_2BYTE_KIND:
2131#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002132 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002133#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002134 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002135 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2136#endif
2137 break;
2138 case PyUnicode_4BYTE_KIND:
2139#if SIZEOF_WCHAR_T == 2
2140 /* This is the only case which has to process surrogates, thus
2141 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002142 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002143#else
2144 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002145 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002146#endif
2147 break;
2148 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002149 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002150 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002151
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002152 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002153}
2154
Alexander Belopolsky40018472011-02-26 01:02:56 +00002155PyObject *
2156PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002157{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002158 if (size < 0) {
2159 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002160 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002161 return NULL;
2162 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002163 if (u != NULL)
2164 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2165 else
2166 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002167}
2168
Alexander Belopolsky40018472011-02-26 01:02:56 +00002169PyObject *
2170PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002171{
2172 size_t size = strlen(u);
2173 if (size > PY_SSIZE_T_MAX) {
2174 PyErr_SetString(PyExc_OverflowError, "input too long");
2175 return NULL;
2176 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002177 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002178}
2179
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002180PyObject *
2181_PyUnicode_FromId(_Py_Identifier *id)
2182{
2183 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002184 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2185 strlen(id->string),
2186 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002187 if (!id->object)
2188 return NULL;
2189 PyUnicode_InternInPlace(&id->object);
2190 assert(!id->next);
2191 id->next = static_strings;
2192 static_strings = id;
2193 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002194 return id->object;
2195}
2196
2197void
2198_PyUnicode_ClearStaticStrings()
2199{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002200 _Py_Identifier *tmp, *s = static_strings;
2201 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002202 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002203 tmp = s->next;
2204 s->next = NULL;
2205 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002206 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002207 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002208}
2209
Benjamin Peterson0df54292012-03-26 14:50:32 -04002210/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002211
Victor Stinnerd3f08822012-05-29 12:57:52 +02002212PyObject*
2213_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002214{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002215 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002216 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002217 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002218#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002219 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002220#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002221 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002222 }
Victor Stinner785938e2011-12-11 20:09:03 +01002223 unicode = PyUnicode_New(size, 127);
2224 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002225 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002226 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2227 assert(_PyUnicode_CheckConsistency(unicode, 1));
2228 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002229}
2230
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002231static Py_UCS4
2232kind_maxchar_limit(unsigned int kind)
2233{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002234 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002235 case PyUnicode_1BYTE_KIND:
2236 return 0x80;
2237 case PyUnicode_2BYTE_KIND:
2238 return 0x100;
2239 case PyUnicode_4BYTE_KIND:
2240 return 0x10000;
2241 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002242 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002243 }
2244}
2245
Victor Stinner702c7342011-10-05 13:50:52 +02002246static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002247_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002248{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002249 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002250 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002251
Serhiy Storchaka678db842013-01-26 12:16:36 +02002252 if (size == 0)
2253 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002254 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002255 if (size == 1)
2256 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002257
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002258 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002259 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002260 if (!res)
2261 return NULL;
2262 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002263 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002264 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002265}
2266
Victor Stinnere57b1c02011-09-28 22:20:48 +02002267static PyObject*
2268_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002269{
2270 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002271 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002272
Serhiy Storchaka678db842013-01-26 12:16:36 +02002273 if (size == 0)
2274 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002275 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002276 if (size == 1)
2277 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002278
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002279 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002280 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002281 if (!res)
2282 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002283 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002285 else {
2286 _PyUnicode_CONVERT_BYTES(
2287 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2288 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002289 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002290 return res;
2291}
2292
Victor Stinnere57b1c02011-09-28 22:20:48 +02002293static PyObject*
2294_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002295{
2296 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002297 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002298
Serhiy Storchaka678db842013-01-26 12:16:36 +02002299 if (size == 0)
2300 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002301 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002302 if (size == 1)
2303 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002304
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002305 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002306 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002307 if (!res)
2308 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002309 if (max_char < 256)
2310 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2311 PyUnicode_1BYTE_DATA(res));
2312 else if (max_char < 0x10000)
2313 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2314 PyUnicode_2BYTE_DATA(res));
2315 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002316 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002317 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002318 return res;
2319}
2320
2321PyObject*
2322PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2323{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002324 if (size < 0) {
2325 PyErr_SetString(PyExc_ValueError, "size must be positive");
2326 return NULL;
2327 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002328 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002330 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002331 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002332 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002334 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002335 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002336 PyErr_SetString(PyExc_SystemError, "invalid kind");
2337 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002338 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002339}
2340
Victor Stinnerece58de2012-04-23 23:36:38 +02002341Py_UCS4
2342_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2343{
2344 enum PyUnicode_Kind kind;
2345 void *startptr, *endptr;
2346
2347 assert(PyUnicode_IS_READY(unicode));
2348 assert(0 <= start);
2349 assert(end <= PyUnicode_GET_LENGTH(unicode));
2350 assert(start <= end);
2351
2352 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2353 return PyUnicode_MAX_CHAR_VALUE(unicode);
2354
2355 if (start == end)
2356 return 127;
2357
Victor Stinner94d558b2012-04-27 22:26:58 +02002358 if (PyUnicode_IS_ASCII(unicode))
2359 return 127;
2360
Victor Stinnerece58de2012-04-23 23:36:38 +02002361 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002362 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002363 endptr = (char *)startptr + end * kind;
2364 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002365 switch(kind) {
2366 case PyUnicode_1BYTE_KIND:
2367 return ucs1lib_find_max_char(startptr, endptr);
2368 case PyUnicode_2BYTE_KIND:
2369 return ucs2lib_find_max_char(startptr, endptr);
2370 case PyUnicode_4BYTE_KIND:
2371 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002372 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002373 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002374 }
2375}
2376
Victor Stinner25a4b292011-10-06 12:31:55 +02002377/* Ensure that a string uses the most efficient storage, if it is not the
2378 case: create a new string with of the right kind. Write NULL into *p_unicode
2379 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002380static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002381unicode_adjust_maxchar(PyObject **p_unicode)
2382{
2383 PyObject *unicode, *copy;
2384 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002385 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002386 unsigned int kind;
2387
2388 assert(p_unicode != NULL);
2389 unicode = *p_unicode;
2390 assert(PyUnicode_IS_READY(unicode));
2391 if (PyUnicode_IS_ASCII(unicode))
2392 return;
2393
2394 len = PyUnicode_GET_LENGTH(unicode);
2395 kind = PyUnicode_KIND(unicode);
2396 if (kind == PyUnicode_1BYTE_KIND) {
2397 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002398 max_char = ucs1lib_find_max_char(u, u + len);
2399 if (max_char >= 128)
2400 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002401 }
2402 else if (kind == PyUnicode_2BYTE_KIND) {
2403 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002404 max_char = ucs2lib_find_max_char(u, u + len);
2405 if (max_char >= 256)
2406 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002407 }
2408 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002409 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002410 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002411 max_char = ucs4lib_find_max_char(u, u + len);
2412 if (max_char >= 0x10000)
2413 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002414 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002415 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002416 if (copy != NULL)
2417 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002418 Py_DECREF(unicode);
2419 *p_unicode = copy;
2420}
2421
Victor Stinner034f6cf2011-09-30 02:26:44 +02002422PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002423_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002424{
Victor Stinner87af4f22011-11-21 23:03:47 +01002425 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002426 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002427
Victor Stinner034f6cf2011-09-30 02:26:44 +02002428 if (!PyUnicode_Check(unicode)) {
2429 PyErr_BadInternalCall();
2430 return NULL;
2431 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002432 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002433 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002434
Victor Stinner87af4f22011-11-21 23:03:47 +01002435 length = PyUnicode_GET_LENGTH(unicode);
2436 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002437 if (!copy)
2438 return NULL;
2439 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2440
Christian Heimesf051e432016-09-13 20:22:02 +02002441 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002442 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002443 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002444 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002445}
2446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002447
Victor Stinnerbc603d12011-10-02 01:00:40 +02002448/* Widen Unicode objects to larger buffers. Don't write terminating null
2449 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002450
2451void*
2452_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2453{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002454 Py_ssize_t len;
2455 void *result;
2456 unsigned int skind;
2457
Benjamin Petersonbac79492012-01-14 13:34:47 -05002458 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002459 return NULL;
2460
2461 len = PyUnicode_GET_LENGTH(s);
2462 skind = PyUnicode_KIND(s);
2463 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002464 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002465 return NULL;
2466 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002467 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002468 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002469 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002470 if (!result)
2471 return PyErr_NoMemory();
2472 assert(skind == PyUnicode_1BYTE_KIND);
2473 _PyUnicode_CONVERT_BYTES(
2474 Py_UCS1, Py_UCS2,
2475 PyUnicode_1BYTE_DATA(s),
2476 PyUnicode_1BYTE_DATA(s) + len,
2477 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002478 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002479 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002480 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002481 if (!result)
2482 return PyErr_NoMemory();
2483 if (skind == PyUnicode_2BYTE_KIND) {
2484 _PyUnicode_CONVERT_BYTES(
2485 Py_UCS2, Py_UCS4,
2486 PyUnicode_2BYTE_DATA(s),
2487 PyUnicode_2BYTE_DATA(s) + len,
2488 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002489 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002490 else {
2491 assert(skind == PyUnicode_1BYTE_KIND);
2492 _PyUnicode_CONVERT_BYTES(
2493 Py_UCS1, Py_UCS4,
2494 PyUnicode_1BYTE_DATA(s),
2495 PyUnicode_1BYTE_DATA(s) + len,
2496 result);
2497 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002498 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002499 default:
2500 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002501 }
Victor Stinner01698042011-10-04 00:04:26 +02002502 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002503 return NULL;
2504}
2505
2506static Py_UCS4*
2507as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2508 int copy_null)
2509{
2510 int kind;
2511 void *data;
2512 Py_ssize_t len, targetlen;
2513 if (PyUnicode_READY(string) == -1)
2514 return NULL;
2515 kind = PyUnicode_KIND(string);
2516 data = PyUnicode_DATA(string);
2517 len = PyUnicode_GET_LENGTH(string);
2518 targetlen = len;
2519 if (copy_null)
2520 targetlen++;
2521 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002522 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002523 if (!target) {
2524 PyErr_NoMemory();
2525 return NULL;
2526 }
2527 }
2528 else {
2529 if (targetsize < targetlen) {
2530 PyErr_Format(PyExc_SystemError,
2531 "string is longer than the buffer");
2532 if (copy_null && 0 < targetsize)
2533 target[0] = 0;
2534 return NULL;
2535 }
2536 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002537 if (kind == PyUnicode_1BYTE_KIND) {
2538 Py_UCS1 *start = (Py_UCS1 *) data;
2539 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002540 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002541 else if (kind == PyUnicode_2BYTE_KIND) {
2542 Py_UCS2 *start = (Py_UCS2 *) data;
2543 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2544 }
2545 else {
2546 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002547 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002548 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002549 if (copy_null)
2550 target[len] = 0;
2551 return target;
2552}
2553
2554Py_UCS4*
2555PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2556 int copy_null)
2557{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002558 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002559 PyErr_BadInternalCall();
2560 return NULL;
2561 }
2562 return as_ucs4(string, target, targetsize, copy_null);
2563}
2564
2565Py_UCS4*
2566PyUnicode_AsUCS4Copy(PyObject *string)
2567{
2568 return as_ucs4(string, NULL, 0, 1);
2569}
2570
/* Maximum number of characters required for the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256), and the
   ceiling of n*53/22 is computed in integer arithmetic as
   1 + (n*53 - 1) / 22. */
#define MAX_LONG_LONG_CHARS (1 + 1 + ((SIZEOF_LONG_LONG) * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002575
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002576static int
2577unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2578 Py_ssize_t width, Py_ssize_t precision)
2579{
2580 Py_ssize_t length, fill, arglen;
2581 Py_UCS4 maxchar;
2582
2583 if (PyUnicode_READY(str) == -1)
2584 return -1;
2585
2586 length = PyUnicode_GET_LENGTH(str);
2587 if ((precision == -1 || precision >= length)
2588 && width <= length)
2589 return _PyUnicodeWriter_WriteStr(writer, str);
2590
2591 if (precision != -1)
2592 length = Py_MIN(precision, length);
2593
2594 arglen = Py_MAX(length, width);
2595 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2596 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2597 else
2598 maxchar = writer->maxchar;
2599
2600 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2601 return -1;
2602
2603 if (width > length) {
2604 fill = width - length;
2605 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2606 return -1;
2607 writer->pos += fill;
2608 }
2609
2610 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2611 str, 0, length);
2612 writer->pos += length;
2613 return 0;
2614}
2615
2616static int
Victor Stinner998b8062018-09-12 00:23:25 +02002617unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002618 Py_ssize_t width, Py_ssize_t precision)
2619{
2620 /* UTF-8 */
2621 Py_ssize_t length;
2622 PyObject *unicode;
2623 int res;
2624
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002625 if (precision == -1) {
2626 length = strlen(str);
2627 }
2628 else {
2629 length = 0;
2630 while (length < precision && str[length]) {
2631 length++;
2632 }
2633 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002634 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2635 if (unicode == NULL)
2636 return -1;
2637
2638 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2639 Py_DECREF(unicode);
2640 return res;
2641}
2642
Victor Stinner96865452011-03-01 23:44:09 +00002643static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002644unicode_fromformat_arg(_PyUnicodeWriter *writer,
2645 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002646{
Victor Stinnere215d962012-10-06 23:03:36 +02002647 const char *p;
2648 Py_ssize_t len;
2649 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002650 Py_ssize_t width;
2651 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002652 int longflag;
2653 int longlongflag;
2654 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002655 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002656
2657 p = f;
2658 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002659 zeropad = 0;
2660 if (*f == '0') {
2661 zeropad = 1;
2662 f++;
2663 }
Victor Stinner96865452011-03-01 23:44:09 +00002664
2665 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002666 width = -1;
2667 if (Py_ISDIGIT((unsigned)*f)) {
2668 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002669 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002670 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002671 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002672 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002673 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002674 return NULL;
2675 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002676 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002677 f++;
2678 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002679 }
2680 precision = -1;
2681 if (*f == '.') {
2682 f++;
2683 if (Py_ISDIGIT((unsigned)*f)) {
2684 precision = (*f - '0');
2685 f++;
2686 while (Py_ISDIGIT((unsigned)*f)) {
2687 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2688 PyErr_SetString(PyExc_ValueError,
2689 "precision too big");
2690 return NULL;
2691 }
2692 precision = (precision * 10) + (*f - '0');
2693 f++;
2694 }
2695 }
Victor Stinner96865452011-03-01 23:44:09 +00002696 if (*f == '%') {
2697 /* "%.3%s" => f points to "3" */
2698 f--;
2699 }
2700 }
2701 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002702 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002703 f--;
2704 }
Victor Stinner96865452011-03-01 23:44:09 +00002705
2706 /* Handle %ld, %lu, %lld and %llu. */
2707 longflag = 0;
2708 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002709 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002710 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002711 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002712 longflag = 1;
2713 ++f;
2714 }
Victor Stinner96865452011-03-01 23:44:09 +00002715 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002716 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002717 longlongflag = 1;
2718 f += 2;
2719 }
Victor Stinner96865452011-03-01 23:44:09 +00002720 }
2721 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002722 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002723 size_tflag = 1;
2724 ++f;
2725 }
Victor Stinnere215d962012-10-06 23:03:36 +02002726
2727 if (f[1] == '\0')
2728 writer->overallocate = 0;
2729
2730 switch (*f) {
2731 case 'c':
2732 {
2733 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002734 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002735 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002736 "character argument not in range(0x110000)");
2737 return NULL;
2738 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002739 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002740 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002741 break;
2742 }
2743
2744 case 'i':
2745 case 'd':
2746 case 'u':
2747 case 'x':
2748 {
2749 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002750 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002751 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002752
2753 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002754 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002755 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002756 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002757 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002758 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002759 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002760 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002761 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002762 va_arg(*vargs, size_t));
2763 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002764 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002765 va_arg(*vargs, unsigned int));
2766 }
2767 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002768 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002769 }
2770 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002771 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002772 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002773 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002774 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002775 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002776 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002777 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002778 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002779 va_arg(*vargs, Py_ssize_t));
2780 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002781 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002782 va_arg(*vargs, int));
2783 }
2784 assert(len >= 0);
2785
Victor Stinnere215d962012-10-06 23:03:36 +02002786 if (precision < len)
2787 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002788
2789 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002790 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2791 return NULL;
2792
Victor Stinnere215d962012-10-06 23:03:36 +02002793 if (width > precision) {
2794 Py_UCS4 fillchar;
2795 fill = width - precision;
2796 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2798 return NULL;
2799 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002800 }
Victor Stinner15a11362012-10-06 23:48:20 +02002801 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002802 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002803 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2804 return NULL;
2805 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002806 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002807
Victor Stinner4a587072013-11-19 12:54:53 +01002808 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2809 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002810 break;
2811 }
2812
2813 case 'p':
2814 {
2815 char number[MAX_LONG_LONG_CHARS];
2816
2817 len = sprintf(number, "%p", va_arg(*vargs, void*));
2818 assert(len >= 0);
2819
2820 /* %p is ill-defined: ensure leading 0x. */
2821 if (number[1] == 'X')
2822 number[1] = 'x';
2823 else if (number[1] != 'x') {
2824 memmove(number + 2, number,
2825 strlen(number) + 1);
2826 number[0] = '0';
2827 number[1] = 'x';
2828 len += 2;
2829 }
2830
Victor Stinner4a587072013-11-19 12:54:53 +01002831 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002832 return NULL;
2833 break;
2834 }
2835
2836 case 's':
2837 {
2838 /* UTF-8 */
2839 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002840 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002841 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002842 break;
2843 }
2844
2845 case 'U':
2846 {
2847 PyObject *obj = va_arg(*vargs, PyObject *);
2848 assert(obj && _PyUnicode_CHECK(obj));
2849
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002850 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002851 return NULL;
2852 break;
2853 }
2854
2855 case 'V':
2856 {
2857 PyObject *obj = va_arg(*vargs, PyObject *);
2858 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002859 if (obj) {
2860 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002861 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002862 return NULL;
2863 }
2864 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002865 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002866 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002867 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002868 }
2869 break;
2870 }
2871
2872 case 'S':
2873 {
2874 PyObject *obj = va_arg(*vargs, PyObject *);
2875 PyObject *str;
2876 assert(obj);
2877 str = PyObject_Str(obj);
2878 if (!str)
2879 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002880 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002881 Py_DECREF(str);
2882 return NULL;
2883 }
2884 Py_DECREF(str);
2885 break;
2886 }
2887
2888 case 'R':
2889 {
2890 PyObject *obj = va_arg(*vargs, PyObject *);
2891 PyObject *repr;
2892 assert(obj);
2893 repr = PyObject_Repr(obj);
2894 if (!repr)
2895 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002896 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002897 Py_DECREF(repr);
2898 return NULL;
2899 }
2900 Py_DECREF(repr);
2901 break;
2902 }
2903
2904 case 'A':
2905 {
2906 PyObject *obj = va_arg(*vargs, PyObject *);
2907 PyObject *ascii;
2908 assert(obj);
2909 ascii = PyObject_ASCII(obj);
2910 if (!ascii)
2911 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002912 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002913 Py_DECREF(ascii);
2914 return NULL;
2915 }
2916 Py_DECREF(ascii);
2917 break;
2918 }
2919
2920 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002921 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002922 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002923 break;
2924
2925 default:
2926 /* if we stumble upon an unknown formatting code, copy the rest
2927 of the format string to the output string. (we cannot just
2928 skip the code, since there's no way to know what's in the
2929 argument list) */
2930 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002931 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002932 return NULL;
2933 f = p+len;
2934 return f;
2935 }
2936
2937 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002938 return f;
2939}
2940
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941PyObject *
2942PyUnicode_FromFormatV(const char *format, va_list vargs)
2943{
Victor Stinnere215d962012-10-06 23:03:36 +02002944 va_list vargs2;
2945 const char *f;
2946 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002947
Victor Stinner8f674cc2013-04-17 23:02:17 +02002948 _PyUnicodeWriter_Init(&writer);
2949 writer.min_length = strlen(format) + 100;
2950 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002951
Benjamin Peterson0c212142016-09-20 20:39:33 -07002952 // Copy varags to be able to pass a reference to a subfunction.
2953 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002954
2955 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002956 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002957 f = unicode_fromformat_arg(&writer, f, &vargs2);
2958 if (f == NULL)
2959 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002960 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002961 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002962 const char *p;
2963 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002964
Victor Stinnere215d962012-10-06 23:03:36 +02002965 p = f;
2966 do
2967 {
2968 if ((unsigned char)*p > 127) {
2969 PyErr_Format(PyExc_ValueError,
2970 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2971 "string, got a non-ASCII byte: 0x%02x",
2972 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002973 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002974 }
2975 p++;
2976 }
2977 while (*p != '\0' && *p != '%');
2978 len = p - f;
2979
2980 if (*p == '\0')
2981 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002982
2983 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002984 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002985
2986 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002987 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002988 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002989 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002990 return _PyUnicodeWriter_Finish(&writer);
2991
2992 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002993 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002994 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002995 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002996}
2997
Walter Dörwaldd2034312007-05-18 16:29:38 +00002998PyObject *
2999PyUnicode_FromFormat(const char *format, ...)
3000{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 PyObject* ret;
3002 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003003
3004#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003005 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003006#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003007 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003008#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003009 ret = PyUnicode_FromFormatV(format, vargs);
3010 va_end(vargs);
3011 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003012}
3013
Serhiy Storchakac46db922018-10-23 22:58:24 +03003014static Py_ssize_t
3015unicode_get_widechar_size(PyObject *unicode)
3016{
3017 Py_ssize_t res;
3018
3019 assert(unicode != NULL);
3020 assert(_PyUnicode_CHECK(unicode));
3021
3022 if (_PyUnicode_WSTR(unicode) != NULL) {
3023 return PyUnicode_WSTR_LENGTH(unicode);
3024 }
3025 assert(PyUnicode_IS_READY(unicode));
3026
3027 res = _PyUnicode_LENGTH(unicode);
3028#if SIZEOF_WCHAR_T == 2
3029 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3030 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3031 const Py_UCS4 *end = s + res;
3032 for (; s < end; ++s) {
3033 if (*s > 0xFFFF) {
3034 ++res;
3035 }
3036 }
3037 }
3038#endif
3039 return res;
3040}
3041
3042static void
3043unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3044{
3045 const wchar_t *wstr;
3046
3047 assert(unicode != NULL);
3048 assert(_PyUnicode_CHECK(unicode));
3049
3050 wstr = _PyUnicode_WSTR(unicode);
3051 if (wstr != NULL) {
3052 memcpy(w, wstr, size * sizeof(wchar_t));
3053 return;
3054 }
3055 assert(PyUnicode_IS_READY(unicode));
3056
3057 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3058 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3059 for (; size--; ++s, ++w) {
3060 *w = *s;
3061 }
3062 }
3063 else {
3064#if SIZEOF_WCHAR_T == 4
3065 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3066 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3067 for (; size--; ++s, ++w) {
3068 *w = *s;
3069 }
3070#else
3071 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3072 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3073 for (; size--; ++s, ++w) {
3074 Py_UCS4 ch = *s;
3075 if (ch > 0xFFFF) {
3076 assert(ch <= MAX_UNICODE);
3077 /* encode surrogate pair in this case */
3078 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3079 if (!size--)
3080 break;
3081 *w = Py_UNICODE_LOW_SURROGATE(ch);
3082 }
3083 else {
3084 *w = ch;
3085 }
3086 }
3087#endif
3088 }
3089}
3090
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003091#ifdef HAVE_WCHAR_H
3092
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003093/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003094
Victor Stinnerd88d9832011-09-06 02:00:05 +02003095 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003096 character) required to convert the unicode object. Ignore size argument.
3097
Victor Stinnerd88d9832011-09-06 02:00:05 +02003098 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003099 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003100 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003101Py_ssize_t
3102PyUnicode_AsWideChar(PyObject *unicode,
3103 wchar_t *w,
3104 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003105{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003106 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003107
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003108 if (unicode == NULL) {
3109 PyErr_BadInternalCall();
3110 return -1;
3111 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003112 if (!PyUnicode_Check(unicode)) {
3113 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003114 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003115 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003116
3117 res = unicode_get_widechar_size(unicode);
3118 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003119 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003120 }
3121
3122 if (size > res) {
3123 size = res + 1;
3124 }
3125 else {
3126 res = size;
3127 }
3128 unicode_copy_as_widechar(unicode, w, size);
3129 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003130}
3131
Victor Stinner137c34c2010-09-29 10:25:54 +00003132wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003133PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003134 Py_ssize_t *size)
3135{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003136 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003137 Py_ssize_t buflen;
3138
3139 if (unicode == NULL) {
3140 PyErr_BadInternalCall();
3141 return NULL;
3142 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003143 if (!PyUnicode_Check(unicode)) {
3144 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003145 return NULL;
3146 }
3147
Serhiy Storchakac46db922018-10-23 22:58:24 +03003148 buflen = unicode_get_widechar_size(unicode);
3149 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003150 if (buffer == NULL) {
3151 PyErr_NoMemory();
3152 return NULL;
3153 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003154 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3155 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003156 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003157 }
3158 else if (wcslen(buffer) != (size_t)buflen) {
3159 PyMem_FREE(buffer);
3160 PyErr_SetString(PyExc_ValueError,
3161 "embedded null character");
3162 return NULL;
3163 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003164 return buffer;
3165}
3166
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003167#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003168
Alexander Belopolsky40018472011-02-26 01:02:56 +00003169PyObject *
3170PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003171{
Victor Stinner8faf8212011-12-08 22:14:11 +01003172 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003173 PyErr_SetString(PyExc_ValueError,
3174 "chr() arg not in range(0x110000)");
3175 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003176 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003177
Victor Stinner985a82a2014-01-03 12:53:47 +01003178 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003179}
3180
Alexander Belopolsky40018472011-02-26 01:02:56 +00003181PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003182PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003183{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003184 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003185 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003186 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003187 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003188 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003189 Py_INCREF(obj);
3190 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003191 }
3192 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003193 /* For a Unicode subtype that's not a Unicode object,
3194 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003195 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003196 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003197 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003198 "Can't convert '%.100s' object to str implicitly",
3199 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003200 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003201}
3202
Alexander Belopolsky40018472011-02-26 01:02:56 +00003203PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003204PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003205 const char *encoding,
3206 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003207{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003208 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003209 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003210
Guido van Rossumd57fd912000-03-10 22:53:23 +00003211 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003212 PyErr_BadInternalCall();
3213 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003214 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003215
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003216 /* Decoding bytes objects is the most common case and should be fast */
3217 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003218 if (PyBytes_GET_SIZE(obj) == 0)
3219 _Py_RETURN_UNICODE_EMPTY();
3220 v = PyUnicode_Decode(
3221 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3222 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003223 return v;
3224 }
3225
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003226 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003227 PyErr_SetString(PyExc_TypeError,
3228 "decoding str is not supported");
3229 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003230 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003231
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003232 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3233 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3234 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003235 "decoding to str: need a bytes-like object, %.80s found",
3236 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003237 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003238 }
Tim Petersced69f82003-09-16 20:30:58 +00003239
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003240 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003241 PyBuffer_Release(&buffer);
3242 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003243 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003244
Serhiy Storchaka05997252013-01-26 12:14:02 +02003245 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003246 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003247 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003248}
3249
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   Characters accepted by Py_ISALNUM(), and '.', are copied to `lower` after
   being passed through Py_TOLOWER().  Every run of other characters that
   sits between two copied characters is replaced by a single '_'; such runs
   at the start or end of the name are dropped.  The result is always
   NUL-terminated on success.

   `lower_len` is the size of the `lower` buffer in bytes, including room
   for the terminating NUL.

   Return 1 on success, or 0 on error (the normalized name needs more than
   lower_len-1 characters, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    /* A zero-sized buffer cannot even hold the terminator; without this
       check lower_len - 1 would wrap around and the final NUL would be
       written outside the buffer. */
    if (lower_len == 0) {
        return 0;
    }

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating NUL. */
    l_end = &lower[lower_len - 1];
    /* Set while a run of separator characters is pending. */
    punct = 0;
    while (1) {
        char c = *e;
        if (c == 0) {
            break;
        }

        if (Py_ISALNUM(c) || c == '.') {
            /* Emit one '_' for the pending separator run, but never as the
               first output character. */
            if (punct && l != lower) {
                if (l == l_end) {
                    return 0;
                }
                *l++ = '_';
            }
            punct = 0;

            if (l == l_end) {
                return 0;
            }
            *l++ = Py_TOLOWER(c);
        }
        else {
            punct = 1;
        }

        e++;
    }
    *l = '\0';
    return 1;
}
3298
Alexander Belopolsky40018472011-02-26 01:02:56 +00003299PyObject *
3300PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003301 Py_ssize_t size,
3302 const char *encoding,
3303 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003304{
3305 PyObject *buffer = NULL, *unicode;
3306 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003307 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3308
3309 if (encoding == NULL) {
3310 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3311 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003312
Fred Drakee4315f52000-05-09 19:53:39 +00003313 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003314 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3315 char *lower = buflower;
3316
3317 /* Fast paths */
3318 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3319 lower += 3;
3320 if (*lower == '_') {
3321 /* Match "utf8" and "utf_8" */
3322 lower++;
3323 }
3324
3325 if (lower[0] == '8' && lower[1] == 0) {
3326 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3327 }
3328 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3329 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3330 }
3331 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3332 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3333 }
3334 }
3335 else {
3336 if (strcmp(lower, "ascii") == 0
3337 || strcmp(lower, "us_ascii") == 0) {
3338 return PyUnicode_DecodeASCII(s, size, errors);
3339 }
Steve Dowercc16be82016-09-08 10:35:16 -07003340 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003341 else if (strcmp(lower, "mbcs") == 0) {
3342 return PyUnicode_DecodeMBCS(s, size, errors);
3343 }
3344 #endif
3345 else if (strcmp(lower, "latin1") == 0
3346 || strcmp(lower, "latin_1") == 0
3347 || strcmp(lower, "iso_8859_1") == 0
3348 || strcmp(lower, "iso8859_1") == 0) {
3349 return PyUnicode_DecodeLatin1(s, size, errors);
3350 }
3351 }
Victor Stinner37296e82010-06-10 13:36:23 +00003352 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353
3354 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003355 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003356 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003357 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003358 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003359 if (buffer == NULL)
3360 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003361 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003362 if (unicode == NULL)
3363 goto onError;
3364 if (!PyUnicode_Check(unicode)) {
3365 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003366 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003367 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003368 encoding,
3369 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003370 Py_DECREF(unicode);
3371 goto onError;
3372 }
3373 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003374 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003375
Benjamin Peterson29060642009-01-31 22:14:21 +00003376 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003377 Py_XDECREF(buffer);
3378 return NULL;
3379}
3380
Alexander Belopolsky40018472011-02-26 01:02:56 +00003381PyObject *
3382PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003383 const char *encoding,
3384 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003385{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003386 if (!PyUnicode_Check(unicode)) {
3387 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003388 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003389 }
3390
Serhiy Storchaka00939072016-10-27 21:05:49 +03003391 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3392 "PyUnicode_AsDecodedObject() is deprecated; "
3393 "use PyCodec_Decode() to decode from str", 1) < 0)
3394 return NULL;
3395
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003396 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003397 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003398
3399 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003400 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003401}
3402
Alexander Belopolsky40018472011-02-26 01:02:56 +00003403PyObject *
3404PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003405 const char *encoding,
3406 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003407{
3408 PyObject *v;
3409
3410 if (!PyUnicode_Check(unicode)) {
3411 PyErr_BadArgument();
3412 goto onError;
3413 }
3414
Serhiy Storchaka00939072016-10-27 21:05:49 +03003415 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3416 "PyUnicode_AsDecodedUnicode() is deprecated; "
3417 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3418 return NULL;
3419
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003420 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003421 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003422
3423 /* Decode via the codec registry */
3424 v = PyCodec_Decode(unicode, encoding, errors);
3425 if (v == NULL)
3426 goto onError;
3427 if (!PyUnicode_Check(v)) {
3428 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003429 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003430 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003431 encoding,
3432 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433 Py_DECREF(v);
3434 goto onError;
3435 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003436 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003437
Benjamin Peterson29060642009-01-31 22:14:21 +00003438 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003439 return NULL;
3440}
3441
Alexander Belopolsky40018472011-02-26 01:02:56 +00003442PyObject *
3443PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003444 Py_ssize_t size,
3445 const char *encoding,
3446 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447{
3448 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003449
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003450 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003451 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003453 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3454 Py_DECREF(unicode);
3455 return v;
3456}
3457
Alexander Belopolsky40018472011-02-26 01:02:56 +00003458PyObject *
3459PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003460 const char *encoding,
3461 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003462{
3463 PyObject *v;
3464
3465 if (!PyUnicode_Check(unicode)) {
3466 PyErr_BadArgument();
3467 goto onError;
3468 }
3469
Serhiy Storchaka00939072016-10-27 21:05:49 +03003470 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3471 "PyUnicode_AsEncodedObject() is deprecated; "
3472 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3473 "or PyCodec_Encode() for generic encoding", 1) < 0)
3474 return NULL;
3475
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003476 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003477 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003478
3479 /* Encode via the codec registry */
3480 v = PyCodec_Encode(unicode, encoding, errors);
3481 if (v == NULL)
3482 goto onError;
3483 return v;
3484
Benjamin Peterson29060642009-01-31 22:14:21 +00003485 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003486 return NULL;
3487}
3488
Victor Stinner1b579672011-12-17 05:47:23 +01003489
Victor Stinner2cba6b82018-01-10 22:46:15 +01003490static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003491unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003492 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003494 Py_ssize_t wlen;
3495 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3496 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003497 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003498 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003499
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003500 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003501 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003502 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003503 return NULL;
3504 }
3505
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003506 char *str;
3507 size_t error_pos;
3508 const char *reason;
3509 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003510 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003511 PyMem_Free(wstr);
3512
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003513 if (res != 0) {
3514 if (res == -2) {
3515 PyObject *exc;
3516 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3517 "locale", unicode,
3518 (Py_ssize_t)error_pos,
3519 (Py_ssize_t)(error_pos+1),
3520 reason);
3521 if (exc != NULL) {
3522 PyCodec_StrictErrors(exc);
3523 Py_DECREF(exc);
3524 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003525 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003526 else if (res == -3) {
3527 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3528 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003529 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003530 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003531 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003532 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003533 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003534
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003535 PyObject *bytes = PyBytes_FromString(str);
3536 PyMem_RawFree(str);
3537 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003538}
3539
Victor Stinnerad158722010-10-27 00:25:46 +00003540PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003541PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3542{
Victor Stinner709d23d2019-05-02 14:56:30 -04003543 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3544 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003545}
3546
3547PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003548PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003549{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003550 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003551#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003552 if (interp->fs_codec.encoding) {
3553 return unicode_encode_utf8(unicode,
3554 interp->fs_codec.error_handler,
3555 interp->fs_codec.errors);
3556 }
3557 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003558 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003559 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003560 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003561 assert(errors != _Py_ERROR_UNKNOWN);
3562 return unicode_encode_utf8(unicode, errors, NULL);
3563 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003564#else
Victor Stinner793b5312011-04-27 00:24:21 +02003565 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3566 cannot use it to encode and decode filenames before it is loaded. Load
3567 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003568 implementation of the locale codec until the codec registry is
3569 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003570 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003571 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003572 interp->fs_codec.encoding,
3573 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003574 }
3575 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003576 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003577 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003578 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003579 assert(errors != _Py_ERROR_UNKNOWN);
3580 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003581 }
Victor Stinnerad158722010-10-27 00:25:46 +00003582#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003583}
3584
Alexander Belopolsky40018472011-02-26 01:02:56 +00003585PyObject *
3586PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003587 const char *encoding,
3588 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003589{
3590 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003591 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003592
Guido van Rossumd57fd912000-03-10 22:53:23 +00003593 if (!PyUnicode_Check(unicode)) {
3594 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003595 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003596 }
Fred Drakee4315f52000-05-09 19:53:39 +00003597
Victor Stinner942889a2016-09-05 15:40:10 -07003598 if (encoding == NULL) {
3599 return _PyUnicode_AsUTF8String(unicode, errors);
3600 }
3601
Fred Drakee4315f52000-05-09 19:53:39 +00003602 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003603 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3604 char *lower = buflower;
3605
3606 /* Fast paths */
3607 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3608 lower += 3;
3609 if (*lower == '_') {
3610 /* Match "utf8" and "utf_8" */
3611 lower++;
3612 }
3613
3614 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003615 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003616 }
3617 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3618 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3619 }
3620 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3621 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3622 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003623 }
Victor Stinner942889a2016-09-05 15:40:10 -07003624 else {
3625 if (strcmp(lower, "ascii") == 0
3626 || strcmp(lower, "us_ascii") == 0) {
3627 return _PyUnicode_AsASCIIString(unicode, errors);
3628 }
Steve Dowercc16be82016-09-08 10:35:16 -07003629#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003630 else if (strcmp(lower, "mbcs") == 0) {
3631 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3632 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003633#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003634 else if (strcmp(lower, "latin1") == 0 ||
3635 strcmp(lower, "latin_1") == 0 ||
3636 strcmp(lower, "iso_8859_1") == 0 ||
3637 strcmp(lower, "iso8859_1") == 0) {
3638 return _PyUnicode_AsLatin1String(unicode, errors);
3639 }
3640 }
Victor Stinner37296e82010-06-10 13:36:23 +00003641 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003642
3643 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003644 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003645 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003646 return NULL;
3647
3648 /* The normal path */
3649 if (PyBytes_Check(v))
3650 return v;
3651
3652 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003653 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003654 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003655 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003656
3657 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003658 "encoder %s returned bytearray instead of bytes; "
3659 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003660 encoding);
3661 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003662 Py_DECREF(v);
3663 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003664 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003665
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003666 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3667 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003668 Py_DECREF(v);
3669 return b;
3670 }
3671
3672 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003673 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003674 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003675 encoding,
3676 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003677 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003678 return NULL;
3679}
3680
Alexander Belopolsky40018472011-02-26 01:02:56 +00003681PyObject *
3682PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003683 const char *encoding,
3684 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003685{
3686 PyObject *v;
3687
3688 if (!PyUnicode_Check(unicode)) {
3689 PyErr_BadArgument();
3690 goto onError;
3691 }
3692
Serhiy Storchaka00939072016-10-27 21:05:49 +03003693 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3694 "PyUnicode_AsEncodedUnicode() is deprecated; "
3695 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3696 return NULL;
3697
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003698 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003699 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003700
3701 /* Encode via the codec registry */
3702 v = PyCodec_Encode(unicode, encoding, errors);
3703 if (v == NULL)
3704 goto onError;
3705 if (!PyUnicode_Check(v)) {
3706 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003707 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003708 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003709 encoding,
3710 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003711 Py_DECREF(v);
3712 goto onError;
3713 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003714 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003715
Benjamin Peterson29060642009-01-31 22:14:21 +00003716 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003717 return NULL;
3718}
3719
Victor Stinner2cba6b82018-01-10 22:46:15 +01003720static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003721unicode_decode_locale(const char *str, Py_ssize_t len,
3722 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003723{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003724 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3725 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003726 return NULL;
3727 }
3728
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003729 wchar_t *wstr;
3730 size_t wlen;
3731 const char *reason;
3732 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003733 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003734 if (res != 0) {
3735 if (res == -2) {
3736 PyObject *exc;
3737 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3738 "locale", str, len,
3739 (Py_ssize_t)wlen,
3740 (Py_ssize_t)(wlen + 1),
3741 reason);
3742 if (exc != NULL) {
3743 PyCodec_StrictErrors(exc);
3744 Py_DECREF(exc);
3745 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003747 else if (res == -3) {
3748 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3749 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003750 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003751 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003752 }
Victor Stinner2f197072011-12-17 07:08:30 +01003753 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003754 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003755
3756 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3757 PyMem_RawFree(wstr);
3758 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003759}
3760
3761PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003762PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3763 const char *errors)
3764{
Victor Stinner709d23d2019-05-02 14:56:30 -04003765 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3766 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003767}
3768
3769PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003770PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003771{
3772 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003773 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3774 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003775}
3776
3777
3778PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003779PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003780 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003781 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3782}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003783
Christian Heimes5894ba72007-11-04 11:43:14 +00003784PyObject*
3785PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3786{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003787 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003788#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003789 if (interp->fs_codec.encoding) {
3790 return unicode_decode_utf8(s, size,
3791 interp->fs_codec.error_handler,
3792 interp->fs_codec.errors,
3793 NULL);
3794 }
3795 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003796 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003797 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003798 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003799 assert(errors != _Py_ERROR_UNKNOWN);
3800 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3801 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003802#else
Victor Stinner793b5312011-04-27 00:24:21 +02003803 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3804 cannot use it to encode and decode filenames before it is loaded. Load
3805 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003806 implementation of the locale codec until the codec registry is
3807 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003808 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003809 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003810 interp->fs_codec.encoding,
3811 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812 }
3813 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003814 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003815 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003816 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003817 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003818 }
Victor Stinnerad158722010-10-27 00:25:46 +00003819#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003820}
3821
Martin v. Löwis011e8422009-05-05 04:43:17 +00003822
3823int
3824PyUnicode_FSConverter(PyObject* arg, void* addr)
3825{
Brett Cannonec6ce872016-09-06 15:50:29 -07003826 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003827 PyObject *output = NULL;
3828 Py_ssize_t size;
3829 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003830 if (arg == NULL) {
3831 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003832 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003833 return 1;
3834 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003835 path = PyOS_FSPath(arg);
3836 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003837 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003838 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003839 if (PyBytes_Check(path)) {
3840 output = path;
3841 }
3842 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3843 output = PyUnicode_EncodeFSDefault(path);
3844 Py_DECREF(path);
3845 if (!output) {
3846 return 0;
3847 }
3848 assert(PyBytes_Check(output));
3849 }
3850
Victor Stinner0ea2a462010-04-30 00:22:08 +00003851 size = PyBytes_GET_SIZE(output);
3852 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003853 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003854 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003855 Py_DECREF(output);
3856 return 0;
3857 }
3858 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003859 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003860}
3861
3862
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003863int
3864PyUnicode_FSDecoder(PyObject* arg, void* addr)
3865{
Brett Cannona5711202016-09-06 19:36:01 -07003866 int is_buffer = 0;
3867 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003868 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003869 if (arg == NULL) {
3870 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003871 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003872 return 1;
3873 }
Brett Cannona5711202016-09-06 19:36:01 -07003874
3875 is_buffer = PyObject_CheckBuffer(arg);
3876 if (!is_buffer) {
3877 path = PyOS_FSPath(arg);
3878 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003879 return 0;
3880 }
Brett Cannona5711202016-09-06 19:36:01 -07003881 }
3882 else {
3883 path = arg;
3884 Py_INCREF(arg);
3885 }
3886
3887 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003888 output = path;
3889 }
3890 else if (PyBytes_Check(path) || is_buffer) {
3891 PyObject *path_bytes = NULL;
3892
3893 if (!PyBytes_Check(path) &&
3894 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003895 "path should be string, bytes, or os.PathLike, not %.200s",
3896 Py_TYPE(arg)->tp_name)) {
3897 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003898 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003899 }
3900 path_bytes = PyBytes_FromObject(path);
3901 Py_DECREF(path);
3902 if (!path_bytes) {
3903 return 0;
3904 }
3905 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3906 PyBytes_GET_SIZE(path_bytes));
3907 Py_DECREF(path_bytes);
3908 if (!output) {
3909 return 0;
3910 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003911 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003912 else {
3913 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003914 "path should be string, bytes, or os.PathLike, not %.200s",
3915 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003916 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003917 return 0;
3918 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003919 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003920 Py_DECREF(output);
3921 return 0;
3922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003923 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003924 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003925 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926 Py_DECREF(output);
3927 return 0;
3928 }
3929 *(PyObject**)addr = output;
3930 return Py_CLEANUP_SUPPORTED;
3931}
3932
3933
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003934const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003935PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003936{
Christian Heimesf3863112007-11-22 07:46:41 +00003937 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003938
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003939 if (!PyUnicode_Check(unicode)) {
3940 PyErr_BadArgument();
3941 return NULL;
3942 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003943 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003944 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003945
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003946 if (PyUnicode_UTF8(unicode) == NULL) {
3947 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003948 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003949 if (bytes == NULL)
3950 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3952 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003953 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003954 Py_DECREF(bytes);
3955 return NULL;
3956 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003957 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003958 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003959 PyBytes_AS_STRING(bytes),
3960 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003961 Py_DECREF(bytes);
3962 }
3963
3964 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003965 *psize = PyUnicode_UTF8_LENGTH(unicode);
3966 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003967}
3968
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003969const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003970PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3973}
3974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003975Py_UNICODE *
3976PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3977{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 if (!PyUnicode_Check(unicode)) {
3979 PyErr_BadArgument();
3980 return NULL;
3981 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003982 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3983 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003984 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003985 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003986 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003987
Serhiy Storchakac46db922018-10-23 22:58:24 +03003988 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3989 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3990 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003991 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003992 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003993 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3994 if (w == NULL) {
3995 PyErr_NoMemory();
3996 return NULL;
3997 }
3998 unicode_copy_as_widechar(unicode, w, wlen + 1);
3999 _PyUnicode_WSTR(unicode) = w;
4000 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4001 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004002 }
4003 }
4004 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004005 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004006 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004007}
4008
Alexander Belopolsky40018472011-02-26 01:02:56 +00004009Py_UNICODE *
4010PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004011{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004012 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004013}
4014
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004015const Py_UNICODE *
4016_PyUnicode_AsUnicode(PyObject *unicode)
4017{
4018 Py_ssize_t size;
4019 const Py_UNICODE *wstr;
4020
4021 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4022 if (wstr && wcslen(wstr) != (size_t)size) {
4023 PyErr_SetString(PyExc_ValueError, "embedded null character");
4024 return NULL;
4025 }
4026 return wstr;
4027}
4028
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004029
Alexander Belopolsky40018472011-02-26 01:02:56 +00004030Py_ssize_t
4031PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004032{
4033 if (!PyUnicode_Check(unicode)) {
4034 PyErr_BadArgument();
4035 goto onError;
4036 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004037 if (_PyUnicode_WSTR(unicode) == NULL) {
4038 if (PyUnicode_AsUnicode(unicode) == NULL)
4039 goto onError;
4040 }
4041 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004042
Benjamin Peterson29060642009-01-31 22:14:21 +00004043 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004044 return -1;
4045}
4046
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004047Py_ssize_t
4048PyUnicode_GetLength(PyObject *unicode)
4049{
Victor Stinner07621332012-06-16 04:53:46 +02004050 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004051 PyErr_BadArgument();
4052 return -1;
4053 }
Victor Stinner07621332012-06-16 04:53:46 +02004054 if (PyUnicode_READY(unicode) == -1)
4055 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004056 return PyUnicode_GET_LENGTH(unicode);
4057}
4058
4059Py_UCS4
4060PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4061{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004062 void *data;
4063 int kind;
4064
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004065 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004066 PyErr_BadArgument();
4067 return (Py_UCS4)-1;
4068 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004069 if (PyUnicode_READY(unicode) == -1) {
4070 return (Py_UCS4)-1;
4071 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004072 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004073 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004074 return (Py_UCS4)-1;
4075 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004076 data = PyUnicode_DATA(unicode);
4077 kind = PyUnicode_KIND(unicode);
4078 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004079}
4080
4081int
4082PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4083{
4084 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004085 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004086 return -1;
4087 }
Victor Stinner488fa492011-12-12 00:01:39 +01004088 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004089 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004090 PyErr_SetString(PyExc_IndexError, "string index out of range");
4091 return -1;
4092 }
Victor Stinner488fa492011-12-12 00:01:39 +01004093 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004094 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004095 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4096 PyErr_SetString(PyExc_ValueError, "character out of range");
4097 return -1;
4098 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004099 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4100 index, ch);
4101 return 0;
4102}
4103
/* Return the name of the default encoding: always the constant string
   "utf-8".  The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4109
Victor Stinner554f3f02010-06-16 23:33:54 +00004110/* create or adjust a UnicodeDecodeError */
4111static void
4112make_decode_exception(PyObject **exceptionObject,
4113 const char *encoding,
4114 const char *input, Py_ssize_t length,
4115 Py_ssize_t startpos, Py_ssize_t endpos,
4116 const char *reason)
4117{
4118 if (*exceptionObject == NULL) {
4119 *exceptionObject = PyUnicodeDecodeError_Create(
4120 encoding, input, length, startpos, endpos, reason);
4121 }
4122 else {
4123 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4124 goto onError;
4125 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4126 goto onError;
4127 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4128 goto onError;
4129 }
4130 return;
4131
4132onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004133 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004134}
4135
Steve Dowercc16be82016-09-08 10:35:16 -07004136#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004137static int
4138widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4139{
4140 if (newsize > *size) {
4141 wchar_t *newbuf = *buf;
4142 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4143 PyErr_NoMemory();
4144 return -1;
4145 }
4146 *buf = newbuf;
4147 }
4148 *size = newsize;
4149 return 0;
4150}
4151
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004152/* error handling callback helper:
4153 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004154 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004155 and adjust various state variables.
4156 return 0 on success, -1 on error
4157*/
4158
Alexander Belopolsky40018472011-02-26 01:02:56 +00004159static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004160unicode_decode_call_errorhandler_wchar(
4161 const char *errors, PyObject **errorHandler,
4162 const char *encoding, const char *reason,
4163 const char **input, const char **inend, Py_ssize_t *startinpos,
4164 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004165 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004166{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004167 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004168
4169 PyObject *restuple = NULL;
4170 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004171 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004172 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004173 Py_ssize_t requiredsize;
4174 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004175 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004176 wchar_t *repwstr;
4177 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004178
4179 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004180 *errorHandler = PyCodec_LookupError(errors);
4181 if (*errorHandler == NULL)
4182 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004183 }
4184
Victor Stinner554f3f02010-06-16 23:33:54 +00004185 make_decode_exception(exceptionObject,
4186 encoding,
4187 *input, *inend - *input,
4188 *startinpos, *endinpos,
4189 reason);
4190 if (*exceptionObject == NULL)
4191 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004192
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004193 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004194 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004196 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004197 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004198 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004199 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004200 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004201 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202
4203 /* Copy back the bytes variables, which might have been modified by the
4204 callback */
4205 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4206 if (!inputobj)
4207 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004208 *input = PyBytes_AS_STRING(inputobj);
4209 insize = PyBytes_GET_SIZE(inputobj);
4210 *inend = *input + insize;
4211 /* we can DECREF safely, as the exception has another reference,
4212 so the object won't go away. */
4213 Py_DECREF(inputobj);
4214
4215 if (newpos<0)
4216 newpos = insize+newpos;
4217 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004218 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004219 goto onError;
4220 }
4221
4222 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4223 if (repwstr == NULL)
4224 goto onError;
4225 /* need more space? (at least enough for what we
4226 have+the replacement+the rest of the string (starting
4227 at the new input position), so we won't have to check space
4228 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004229 requiredsize = *outpos;
4230 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4231 goto overflow;
4232 requiredsize += repwlen;
4233 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4234 goto overflow;
4235 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004236 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004238 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004240 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004241 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004242 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004243 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004244 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004246 *endinpos = newpos;
4247 *inptr = *input + newpos;
4248
4249 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004250 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 return 0;
4252
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004253 overflow:
4254 PyErr_SetString(PyExc_OverflowError,
4255 "decoded result is too long for a Python string");
4256
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004257 onError:
4258 Py_XDECREF(restuple);
4259 return -1;
4260}
Steve Dowercc16be82016-09-08 10:35:16 -07004261#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004262
4263static int
4264unicode_decode_call_errorhandler_writer(
4265 const char *errors, PyObject **errorHandler,
4266 const char *encoding, const char *reason,
4267 const char **input, const char **inend, Py_ssize_t *startinpos,
4268 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4269 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4270{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004271 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004272
4273 PyObject *restuple = NULL;
4274 PyObject *repunicode = NULL;
4275 Py_ssize_t insize;
4276 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004277 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004278 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004279 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004280 int need_to_grow = 0;
4281 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004282
4283 if (*errorHandler == NULL) {
4284 *errorHandler = PyCodec_LookupError(errors);
4285 if (*errorHandler == NULL)
4286 goto onError;
4287 }
4288
4289 make_decode_exception(exceptionObject,
4290 encoding,
4291 *input, *inend - *input,
4292 *startinpos, *endinpos,
4293 reason);
4294 if (*exceptionObject == NULL)
4295 goto onError;
4296
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004297 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004298 if (restuple == NULL)
4299 goto onError;
4300 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004301 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004302 goto onError;
4303 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004304 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004305 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004306
4307 /* Copy back the bytes variables, which might have been modified by the
4308 callback */
4309 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4310 if (!inputobj)
4311 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004312 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004313 *input = PyBytes_AS_STRING(inputobj);
4314 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004315 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004316 /* we can DECREF safely, as the exception has another reference,
4317 so the object won't go away. */
4318 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004319
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004321 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004322 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004323 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004324 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004325 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004326
Victor Stinner170ca6f2013-04-18 00:25:28 +02004327 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004328 if (replen > 1) {
4329 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004330 need_to_grow = 1;
4331 }
4332 new_inptr = *input + newpos;
4333 if (*inend - new_inptr > remain) {
4334 /* We don't know the decoding algorithm here so we make the worst
4335 assumption that one byte decodes to one unicode character.
4336 If unfortunately one byte could decode to more unicode characters,
4337 the decoder may write out-of-bound then. Is it possible for the
4338 algorithms using this function? */
4339 writer->min_length += *inend - new_inptr - remain;
4340 need_to_grow = 1;
4341 }
4342 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004343 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004344 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004345 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4346 goto onError;
4347 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004349 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004352 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004353
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004354 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004355 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004356 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004357
Benjamin Peterson29060642009-01-31 22:14:21 +00004358 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004359 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004360 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004361}
4362
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.
 *
 * Note: all macros in this section evaluate their character argument more
 * than once, so arguments must be free of side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only lookup data, hence const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * directO and directWS are parenthesized in the expansion so that
 * compound arguments (e.g. "a || b") combine correctly with the
 * category tests. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004443
Alexander Belopolsky40018472011-02-26 01:02:56 +00004444PyObject *
4445PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004446 Py_ssize_t size,
4447 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004448{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004449 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4450}
4451
Antoine Pitrou244651a2009-05-04 18:56:13 +00004452/* The decoder. The only state we preserve is our read position,
4453 * i.e. how many characters we have consumed. So if we end in the
4454 * middle of a shift sequence we have to back off the read position
4455 * and the output to the beginning of the sequence, otherwise we lose
4456 * all the shift state (seen bits, number of bits seen, high
4457 * surrogate). */
4458
Alexander Belopolsky40018472011-02-26 01:02:56 +00004459PyObject *
4460PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004461 Py_ssize_t size,
4462 const char *errors,
4463 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004464{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004465 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004466 Py_ssize_t startinpos;
4467 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004468 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004469 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004470 const char *errmsg = "";
4471 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004472 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004473 unsigned int base64bits = 0;
4474 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004475 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004476 PyObject *errorHandler = NULL;
4477 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004479 if (size == 0) {
4480 if (consumed)
4481 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004482 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004483 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004485 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004486 _PyUnicodeWriter_Init(&writer);
4487 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004488
4489 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490 e = s + size;
4491
4492 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004493 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004494 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004495 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004496
Antoine Pitrou244651a2009-05-04 18:56:13 +00004497 if (inShift) { /* in a base-64 section */
4498 if (IS_BASE64(ch)) { /* consume a base-64 character */
4499 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4500 base64bits += 6;
4501 s++;
4502 if (base64bits >= 16) {
4503 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004504 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004505 base64bits -= 16;
4506 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004507 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 if (surrogate) {
4509 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004510 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4511 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004512 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004513 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004515 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004516 }
4517 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004518 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004519 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004520 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 }
4522 }
Victor Stinner551ac952011-11-29 22:58:13 +01004523 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 /* first surrogate */
4525 surrogate = outCh;
4526 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004527 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004528 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004529 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004530 }
4531 }
4532 }
4533 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004534 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004535 if (base64bits > 0) { /* left-over bits */
4536 if (base64bits >= 6) {
4537 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004538 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004539 errmsg = "partial character in shift sequence";
4540 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004541 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004542 else {
4543 /* Some bits remain; they should be zero */
4544 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004545 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004546 errmsg = "non-zero padding bits in shift sequence";
4547 goto utf7Error;
4548 }
4549 }
4550 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004551 if (surrogate && DECODE_DIRECT(ch)) {
4552 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4553 goto onError;
4554 }
4555 surrogate = 0;
4556 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004557 /* '-' is absorbed; other terminating
4558 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004559 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004560 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 }
4562 }
4563 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004564 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004565 s++; /* consume '+' */
4566 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004567 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004568 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004569 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004571 else if (s < e && !IS_BASE64(*s)) {
4572 s++;
4573 errmsg = "ill-formed sequence";
4574 goto utf7Error;
4575 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004577 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004578 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004579 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004580 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004581 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 }
4583 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004584 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004585 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004586 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004587 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004588 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004589 else {
4590 startinpos = s-starts;
4591 s++;
4592 errmsg = "unexpected special character";
4593 goto utf7Error;
4594 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004595 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004596utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004597 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004598 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004599 errors, &errorHandler,
4600 "utf7", errmsg,
4601 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004602 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004603 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004604 }
4605
Antoine Pitrou244651a2009-05-04 18:56:13 +00004606 /* end of string */
4607
4608 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4609 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004610 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 if (surrogate ||
4612 (base64bits >= 6) ||
4613 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004615 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 errors, &errorHandler,
4617 "utf7", "unterminated shift sequence",
4618 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004619 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004620 goto onError;
4621 if (s < e)
4622 goto restart;
4623 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004624 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004625
4626 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004627 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004628 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004629 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004630 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004631 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004632 writer.kind, writer.data, shiftOutStart);
4633 Py_XDECREF(errorHandler);
4634 Py_XDECREF(exc);
4635 _PyUnicodeWriter_Dealloc(&writer);
4636 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004637 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004638 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004639 }
4640 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004641 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004643 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004644
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004645 Py_XDECREF(errorHandler);
4646 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004647 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004648
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004650 Py_XDECREF(errorHandler);
4651 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004652 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004653 return NULL;
4654}
4655
4656
Alexander Belopolsky40018472011-02-26 01:02:56 +00004657PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004658_PyUnicode_EncodeUTF7(PyObject *str,
4659 int base64SetO,
4660 int base64WhiteSpace,
4661 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004662{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004663 int kind;
4664 void *data;
4665 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004666 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004667 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004668 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004669 unsigned int base64bits = 0;
4670 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004671 char * out;
4672 char * start;
4673
Benjamin Petersonbac79492012-01-14 13:34:47 -05004674 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004675 return NULL;
4676 kind = PyUnicode_KIND(str);
4677 data = PyUnicode_DATA(str);
4678 len = PyUnicode_GET_LENGTH(str);
4679
4680 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004681 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004682
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004683 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004684 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004685 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004686 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687 if (v == NULL)
4688 return NULL;
4689
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004690 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004691 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004692 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004693
Antoine Pitrou244651a2009-05-04 18:56:13 +00004694 if (inShift) {
4695 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4696 /* shifting out */
4697 if (base64bits) { /* output remaining bits */
4698 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4699 base64buffer = 0;
4700 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 }
4702 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004703 /* Characters not in the BASE64 set implicitly unshift the sequence
4704 so no '-' is required, except if the character is itself a '-' */
4705 if (IS_BASE64(ch) || ch == '-') {
4706 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 *out++ = (char) ch;
4709 }
4710 else {
4711 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004712 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004714 else { /* not in a shift sequence */
4715 if (ch == '+') {
4716 *out++ = '+';
4717 *out++ = '-';
4718 }
4719 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4720 *out++ = (char) ch;
4721 }
4722 else {
4723 *out++ = '+';
4724 inShift = 1;
4725 goto encode_char;
4726 }
4727 }
4728 continue;
4729encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004731 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004732
Antoine Pitrou244651a2009-05-04 18:56:13 +00004733 /* code first surrogate */
4734 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004735 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 while (base64bits >= 6) {
4737 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4738 base64bits -= 6;
4739 }
4740 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004741 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004742 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004743 base64bits += 16;
4744 base64buffer = (base64buffer << 16) | ch;
4745 while (base64bits >= 6) {
4746 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4747 base64bits -= 6;
4748 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004749 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004750 if (base64bits)
4751 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4752 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004754 if (_PyBytes_Resize(&v, out - start) < 0)
4755 return NULL;
4756 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004757}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004758PyObject *
4759PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4760 Py_ssize_t size,
4761 int base64SetO,
4762 int base64WhiteSpace,
4763 const char *errors)
4764{
4765 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004766 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004767 if (tmp == NULL)
4768 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004769 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004770 base64WhiteSpace, errors);
4771 Py_DECREF(tmp);
4772 return result;
4773}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774
Antoine Pitrou244651a2009-05-04 18:56:13 +00004775#undef IS_BASE64
4776#undef FROM_BASE64
4777#undef TO_BASE64
4778#undef DECODE_DIRECT
4779#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004780
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781/* --- UTF-8 Codec -------------------------------------------------------- */
4782
Alexander Belopolsky40018472011-02-26 01:02:56 +00004783PyObject *
4784PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004785 Py_ssize_t size,
4786 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004787{
Walter Dörwald69652032004-09-07 20:24:22 +00004788 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4789}
4790
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004791#include "stringlib/asciilib.h"
4792#include "stringlib/codecs.h"
4793#include "stringlib/undef.h"
4794
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004795#include "stringlib/ucs1lib.h"
4796#include "stringlib/codecs.h"
4797#include "stringlib/undef.h"
4798
4799#include "stringlib/ucs2lib.h"
4800#include "stringlib/codecs.h"
4801#include "stringlib/undef.h"
4802
4803#include "stringlib/ucs4lib.h"
4804#include "stringlib/codecs.h"
4805#include "stringlib/undef.h"
4806
Antoine Pitrouab868312009-01-10 15:40:25 +00004807/* Mask to quickly check whether a C 'long' contains a
4808 non-ASCII, UTF8-encoded char. */
4809#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004810# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004811#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004812# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004813#else
4814# error C 'long' size should be either 4 or 8!
4815#endif
4816
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004817static Py_ssize_t
4818ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004819{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004820 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004821 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004822
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004823 /*
4824 * Issue #17237: m68k is a bit different from most architectures in
4825 * that objects do not use "natural alignment" - for example, int and
4826 * long are only aligned at 2-byte boundaries. Therefore the assert()
4827 * won't work; also, tests have shown that skipping the "optimised
4828 * version" will even speed up m68k.
4829 */
4830#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004831#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004832 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4833 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004834 /* Fast path, see in STRINGLIB(utf8_decode) for
4835 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004836 /* Help allocation */
4837 const char *_p = p;
4838 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004839 while (_p < aligned_end) {
4840 unsigned long value = *(const unsigned long *) _p;
4841 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004842 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004843 *((unsigned long *)q) = value;
4844 _p += SIZEOF_LONG;
4845 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004846 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 p = _p;
4848 while (p < end) {
4849 if ((unsigned char)*p & 0x80)
4850 break;
4851 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004852 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004853 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004854 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004855#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004856#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 while (p < end) {
4858 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4859 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004860 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004861 /* Help allocation */
4862 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863 while (_p < aligned_end) {
4864 unsigned long value = *(unsigned long *) _p;
4865 if (value & ASCII_CHAR_MASK)
4866 break;
4867 _p += SIZEOF_LONG;
4868 }
4869 p = _p;
4870 if (_p == end)
4871 break;
4872 }
4873 if ((unsigned char)*p & 0x80)
4874 break;
4875 ++p;
4876 }
4877 memcpy(dest, start, p - start);
4878 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004879}
Antoine Pitrouab868312009-01-10 15:40:25 +00004880
Victor Stinner709d23d2019-05-02 14:56:30 -04004881static PyObject *
4882unicode_decode_utf8(const char *s, Py_ssize_t size,
4883 _Py_error_handler error_handler, const char *errors,
4884 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004885{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004886 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004887 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889
4890 Py_ssize_t startinpos;
4891 Py_ssize_t endinpos;
4892 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004893 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004894 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004895
4896 if (size == 0) {
4897 if (consumed)
4898 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004899 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004900 }
4901
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004902 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4903 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004904 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004905 *consumed = 1;
4906 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004907 }
4908
Victor Stinner8f674cc2013-04-17 23:02:17 +02004909 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004910 writer.min_length = size;
4911 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004913
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004914 writer.pos = ascii_decode(s, end, writer.data);
4915 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004916 while (s < end) {
4917 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004918 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004919
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004920 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004921 if (PyUnicode_IS_ASCII(writer.buffer))
4922 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004923 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004924 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004925 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004926 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004927 } else {
4928 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004929 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004930 }
4931
4932 switch (ch) {
4933 case 0:
4934 if (s == end || consumed)
4935 goto End;
4936 errmsg = "unexpected end of data";
4937 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004938 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004939 break;
4940 case 1:
4941 errmsg = "invalid start byte";
4942 startinpos = s - starts;
4943 endinpos = startinpos + 1;
4944 break;
4945 case 2:
Miss Islington (bot)d32594a2019-06-25 02:12:16 -07004946 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4947 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4948 {
4949 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004950 goto End;
4951 }
Miss Islington (bot)d32594a2019-06-25 02:12:16 -07004952 /* fall through */
4953 case 3:
4954 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 errmsg = "invalid continuation byte";
4956 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004957 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004958 break;
4959 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004960 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004961 goto onError;
4962 continue;
4963 }
4964
Victor Stinner1d65d912015-10-05 13:43:50 +02004965 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004966 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004967
4968 switch (error_handler) {
4969 case _Py_ERROR_IGNORE:
4970 s += (endinpos - startinpos);
4971 break;
4972
4973 case _Py_ERROR_REPLACE:
4974 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4975 goto onError;
4976 s += (endinpos - startinpos);
4977 break;
4978
4979 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004980 {
4981 Py_ssize_t i;
4982
Victor Stinner1d65d912015-10-05 13:43:50 +02004983 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4984 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004985 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004986 ch = (Py_UCS4)(unsigned char)(starts[i]);
4987 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4988 ch + 0xdc00);
4989 writer.pos++;
4990 }
4991 s += (endinpos - startinpos);
4992 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004993 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004994
4995 default:
4996 if (unicode_decode_call_errorhandler_writer(
4997 errors, &error_handler_obj,
4998 "utf-8", errmsg,
4999 &starts, &end, &startinpos, &endinpos, &exc, &s,
5000 &writer))
5001 goto onError;
5002 }
Victor Stinner785938e2011-12-11 20:09:03 +01005003 }
5004
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006 if (consumed)
5007 *consumed = s - starts;
5008
Victor Stinner1d65d912015-10-05 13:43:50 +02005009 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005011 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005012
5013onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005014 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005015 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005016 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005017 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005018}
5019
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005020
Victor Stinner709d23d2019-05-02 14:56:30 -04005021PyObject *
5022PyUnicode_DecodeUTF8Stateful(const char *s,
5023 Py_ssize_t size,
5024 const char *errors,
5025 Py_ssize_t *consumed)
5026{
5027 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5028}
5029
5030
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005031/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5032 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005033
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005034 On success, write a pointer to a newly allocated wide character string into
5035 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5036 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005037
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005038 On memory allocation failure, return -1.
5039
5040 On decoding error (if surrogateescape is zero), return -2. If wlen is
5041 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5042 is not NULL, write the decoding error message into *reason. */
5043int
5044_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005045 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005046{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005047 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005048 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005049 wchar_t *unicode;
5050 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005051
Victor Stinner3d4226a2018-08-29 22:21:32 +02005052 int surrogateescape = 0;
5053 int surrogatepass = 0;
5054 switch (errors)
5055 {
5056 case _Py_ERROR_STRICT:
5057 break;
5058 case _Py_ERROR_SURROGATEESCAPE:
5059 surrogateescape = 1;
5060 break;
5061 case _Py_ERROR_SURROGATEPASS:
5062 surrogatepass = 1;
5063 break;
5064 default:
5065 return -3;
5066 }
5067
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005068 /* Note: size will always be longer than the resulting Unicode
5069 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005070 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005071 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005072 }
5073
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005074 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005075 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005076 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005077 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078
5079 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005082 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005084#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005086#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005087 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 if (ch > 0xFF) {
5090#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005091 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005092#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005093 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005094 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5096 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5097#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005099 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005100 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005102 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005103
5104 if (surrogateescape) {
5105 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5106 }
5107 else {
5108 /* Is it a valid three-byte code? */
5109 if (surrogatepass
5110 && (e - s) >= 3
5111 && (s[0] & 0xf0) == 0xe0
5112 && (s[1] & 0xc0) == 0x80
5113 && (s[2] & 0xc0) == 0x80)
5114 {
5115 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5116 s += 3;
5117 unicode[outpos++] = ch;
5118 }
5119 else {
5120 PyMem_RawFree(unicode );
5121 if (reason != NULL) {
5122 switch (ch) {
5123 case 0:
5124 *reason = "unexpected end of data";
5125 break;
5126 case 1:
5127 *reason = "invalid start byte";
5128 break;
5129 /* 2, 3, 4 */
5130 default:
5131 *reason = "invalid continuation byte";
5132 break;
5133 }
5134 }
5135 if (wlen != NULL) {
5136 *wlen = s - orig_s;
5137 }
5138 return -2;
5139 }
5140 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005142 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005143 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005144 if (wlen) {
5145 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005146 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005147 *wstr = unicode;
5148 return 0;
5149}
5150
Victor Stinner5f9cf232019-03-19 01:46:25 +01005151
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005152wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005153_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5154 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005155{
5156 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005157 int res = _Py_DecodeUTF8Ex(arg, arglen,
5158 &wstr, wlen,
5159 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005161 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5162 assert(res != -3);
5163 if (wlen) {
5164 *wlen = (size_t)res;
5165 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005166 return NULL;
5167 }
5168 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005169}
5170
Antoine Pitrouab868312009-01-10 15:40:25 +00005171
Victor Stinnere47e6982017-12-21 15:45:16 +01005172/* UTF-8 encoder using the surrogateescape error handler .
5173
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005174 On success, return 0 and write the newly allocated character string (use
5175 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005176
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005177 On encoding failure, return -2 and write the position of the invalid
5178 surrogate character into *error_pos (if error_pos is set) and the decoding
5179 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005180
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005181 On memory allocation failure, return -1. */
5182int
5183_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005184 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005185{
5186 const Py_ssize_t max_char_size = 4;
5187 Py_ssize_t len = wcslen(text);
5188
5189 assert(len >= 0);
5190
Victor Stinner3d4226a2018-08-29 22:21:32 +02005191 int surrogateescape = 0;
5192 int surrogatepass = 0;
5193 switch (errors)
5194 {
5195 case _Py_ERROR_STRICT:
5196 break;
5197 case _Py_ERROR_SURROGATEESCAPE:
5198 surrogateescape = 1;
5199 break;
5200 case _Py_ERROR_SURROGATEPASS:
5201 surrogatepass = 1;
5202 break;
5203 default:
5204 return -3;
5205 }
5206
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005207 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5208 return -1;
5209 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005210 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005211 if (raw_malloc) {
5212 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005213 }
5214 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005215 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005216 }
5217 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005219 }
5220
5221 char *p = bytes;
5222 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005223 for (i = 0; i < len; ) {
5224 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005225 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005226 i++;
5227#if Py_UNICODE_SIZE == 2
5228 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5229 && i < len
5230 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5231 {
5232 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5233 i++;
5234 }
5235#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005236
5237 if (ch < 0x80) {
5238 /* Encode ASCII */
5239 *p++ = (char) ch;
5240
5241 }
5242 else if (ch < 0x0800) {
5243 /* Encode Latin-1 */
5244 *p++ = (char)(0xc0 | (ch >> 6));
5245 *p++ = (char)(0x80 | (ch & 0x3f));
5246 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005247 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005248 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005249 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005250 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005251 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005252 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005253 if (reason != NULL) {
5254 *reason = "encoding error";
5255 }
5256 if (raw_malloc) {
5257 PyMem_RawFree(bytes);
5258 }
5259 else {
5260 PyMem_Free(bytes);
5261 }
5262 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005263 }
5264 *p++ = (char)(ch & 0xff);
5265 }
5266 else if (ch < 0x10000) {
5267 *p++ = (char)(0xe0 | (ch >> 12));
5268 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5269 *p++ = (char)(0x80 | (ch & 0x3f));
5270 }
5271 else { /* ch >= 0x10000 */
5272 assert(ch <= MAX_UNICODE);
5273 /* Encode UCS4 Unicode ordinals */
5274 *p++ = (char)(0xf0 | (ch >> 18));
5275 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5276 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5277 *p++ = (char)(0x80 | (ch & 0x3f));
5278 }
5279 }
5280 *p++ = '\0';
5281
5282 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005283 char *bytes2;
5284 if (raw_malloc) {
5285 bytes2 = PyMem_RawRealloc(bytes, final_size);
5286 }
5287 else {
5288 bytes2 = PyMem_Realloc(bytes, final_size);
5289 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005290 if (bytes2 == NULL) {
5291 if (error_pos != NULL) {
5292 *error_pos = (size_t)-1;
5293 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005294 if (raw_malloc) {
5295 PyMem_RawFree(bytes);
5296 }
5297 else {
5298 PyMem_Free(bytes);
5299 }
5300 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005301 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005302 *str = bytes2;
5303 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005304}
5305
5306
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005307/* Primary internal function which creates utf8 encoded bytes objects.
5308
5309 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005310 and allocate exactly as much space needed at the end. Else allocate the
5311 maximum possible needed (4 result bytes per Unicode character), and return
5312 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005313*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005314static PyObject *
5315unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5316 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005317{
Victor Stinner6099a032011-12-18 14:22:26 +01005318 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005319 void *data;
5320 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005321
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005322 if (!PyUnicode_Check(unicode)) {
5323 PyErr_BadArgument();
5324 return NULL;
5325 }
5326
5327 if (PyUnicode_READY(unicode) == -1)
5328 return NULL;
5329
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005330 if (PyUnicode_UTF8(unicode))
5331 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5332 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005333
5334 kind = PyUnicode_KIND(unicode);
5335 data = PyUnicode_DATA(unicode);
5336 size = PyUnicode_GET_LENGTH(unicode);
5337
Benjamin Petersonead6b532011-12-20 17:23:42 -06005338 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005339 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005340 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005341 case PyUnicode_1BYTE_KIND:
5342 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5343 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005344 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005345 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005346 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005347 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005348 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005349 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005350}
5351
Alexander Belopolsky40018472011-02-26 01:02:56 +00005352PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005353_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5354{
5355 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5356}
5357
5358
5359PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005360PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5361 Py_ssize_t size,
5362 const char *errors)
5363{
5364 PyObject *v, *unicode;
5365
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005366 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 if (unicode == NULL)
5368 return NULL;
5369 v = _PyUnicode_AsUTF8String(unicode, errors);
5370 Py_DECREF(unicode);
5371 return v;
5372}
5373
5374PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005375PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005377 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005378}
5379
Walter Dörwald41980ca2007-08-16 21:55:45 +00005380/* --- UTF-32 Codec ------------------------------------------------------- */
5381
5382PyObject *
5383PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005384 Py_ssize_t size,
5385 const char *errors,
5386 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387{
5388 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5389}
5390
5391PyObject *
5392PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005393 Py_ssize_t size,
5394 const char *errors,
5395 int *byteorder,
5396 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005397{
5398 const char *starts = s;
5399 Py_ssize_t startinpos;
5400 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005401 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005402 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005403 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005404 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005405 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005406 PyObject *errorHandler = NULL;
5407 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005408
Walter Dörwald41980ca2007-08-16 21:55:45 +00005409 q = (unsigned char *)s;
5410 e = q + size;
5411
5412 if (byteorder)
5413 bo = *byteorder;
5414
5415 /* Check for BOM marks (U+FEFF) in the input and adjust current
5416 byte order setting accordingly. In native mode, the leading BOM
5417 mark is skipped, in all other modes, it is copied to the output
5418 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005419 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005420 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005421 if (bom == 0x0000FEFF) {
5422 bo = -1;
5423 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005424 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005425 else if (bom == 0xFFFE0000) {
5426 bo = 1;
5427 q += 4;
5428 }
5429 if (byteorder)
5430 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Victor Stinnere64322e2012-10-30 23:12:47 +01005433 if (q == e) {
5434 if (consumed)
5435 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005436 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005437 }
5438
Victor Stinnere64322e2012-10-30 23:12:47 +01005439#ifdef WORDS_BIGENDIAN
5440 le = bo < 0;
5441#else
5442 le = bo <= 0;
5443#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005444 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005445
Victor Stinner8f674cc2013-04-17 23:02:17 +02005446 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005447 writer.min_length = (e - q + 3) / 4;
5448 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005449 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005450
Victor Stinnere64322e2012-10-30 23:12:47 +01005451 while (1) {
5452 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005453 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005454
Victor Stinnere64322e2012-10-30 23:12:47 +01005455 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005456 enum PyUnicode_Kind kind = writer.kind;
5457 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005458 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005459 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005460 if (le) {
5461 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005462 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005463 if (ch > maxch)
5464 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005465 if (kind != PyUnicode_1BYTE_KIND &&
5466 Py_UNICODE_IS_SURROGATE(ch))
5467 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005468 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005469 q += 4;
5470 } while (q <= last);
5471 }
5472 else {
5473 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005474 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005475 if (ch > maxch)
5476 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 if (kind != PyUnicode_1BYTE_KIND &&
5478 Py_UNICODE_IS_SURROGATE(ch))
5479 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005480 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005481 q += 4;
5482 } while (q <= last);
5483 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005484 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 }
5486
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005487 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005488 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005489 startinpos = ((const char *)q) - starts;
5490 endinpos = startinpos + 4;
5491 }
5492 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005493 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005494 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005495 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 startinpos = ((const char *)q) - starts;
5498 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005499 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005500 else {
5501 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005502 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005503 goto onError;
5504 q += 4;
5505 continue;
5506 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005507 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005508 startinpos = ((const char *)q) - starts;
5509 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005510 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005511
5512 /* The remaining input chars are ignored if the callback
5513 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005514 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005515 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005516 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005518 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005520 }
5521
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005524
Walter Dörwald41980ca2007-08-16 21:55:45 +00005525 Py_XDECREF(errorHandler);
5526 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005527 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005528
Benjamin Peterson29060642009-01-31 22:14:21 +00005529 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005530 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005531 Py_XDECREF(errorHandler);
5532 Py_XDECREF(exc);
5533 return NULL;
5534}
5535
5536PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537_PyUnicode_EncodeUTF32(PyObject *str,
5538 const char *errors,
5539 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005541 enum PyUnicode_Kind kind;
5542 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005543 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005544 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005545 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005546#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005547 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005548#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005549 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005551 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005552 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005553 PyObject *errorHandler = NULL;
5554 PyObject *exc = NULL;
5555 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005556
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005557 if (!PyUnicode_Check(str)) {
5558 PyErr_BadArgument();
5559 return NULL;
5560 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005561 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005562 return NULL;
5563 kind = PyUnicode_KIND(str);
5564 data = PyUnicode_DATA(str);
5565 len = PyUnicode_GET_LENGTH(str);
5566
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005567 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005568 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005569 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005570 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005571 if (v == NULL)
5572 return NULL;
5573
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005574 /* output buffer is 4-bytes aligned */
5575 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005576 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005577 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005578 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005579 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005580 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005581
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005582 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005583 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005585 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005586 else
5587 encoding = "utf-32";
5588
5589 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005590 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5591 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005592 }
5593
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005594 pos = 0;
5595 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005596 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005597
5598 if (kind == PyUnicode_2BYTE_KIND) {
5599 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5600 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005601 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005602 else {
5603 assert(kind == PyUnicode_4BYTE_KIND);
5604 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5605 &out, native_ordering);
5606 }
5607 if (pos == len)
5608 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005609
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005610 rep = unicode_encode_call_errorhandler(
5611 errors, &errorHandler,
5612 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005613 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005614 if (!rep)
5615 goto error;
5616
5617 if (PyBytes_Check(rep)) {
5618 repsize = PyBytes_GET_SIZE(rep);
5619 if (repsize & 3) {
5620 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005621 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005622 "surrogates not allowed");
5623 goto error;
5624 }
5625 moreunits = repsize / 4;
5626 }
5627 else {
5628 assert(PyUnicode_Check(rep));
5629 if (PyUnicode_READY(rep) < 0)
5630 goto error;
5631 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5632 if (!PyUnicode_IS_ASCII(rep)) {
5633 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005634 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005635 "surrogates not allowed");
5636 goto error;
5637 }
5638 }
5639
5640 /* four bytes are reserved for each surrogate */
5641 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005642 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005643 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005644 /* integer overflow */
5645 PyErr_NoMemory();
5646 goto error;
5647 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005648 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005649 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005650 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 }
5652
5653 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005654 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005655 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005656 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005657 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5659 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005660 }
5661
5662 Py_CLEAR(rep);
5663 }
5664
5665 /* Cut back to size actually needed. This is necessary for, for example,
5666 encoding of a string containing isolated surrogates and the 'ignore'
5667 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005668 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005669 if (nsize != PyBytes_GET_SIZE(v))
5670 _PyBytes_Resize(&v, nsize);
5671 Py_XDECREF(errorHandler);
5672 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005673 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005674 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005675 error:
5676 Py_XDECREF(rep);
5677 Py_XDECREF(errorHandler);
5678 Py_XDECREF(exc);
5679 Py_XDECREF(v);
5680 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005681}
5682
Alexander Belopolsky40018472011-02-26 01:02:56 +00005683PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005684PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5685 Py_ssize_t size,
5686 const char *errors,
5687 int byteorder)
5688{
5689 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005690 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005691 if (tmp == NULL)
5692 return NULL;
5693 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5694 Py_DECREF(tmp);
5695 return result;
5696}
5697
5698PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005699PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005700{
Victor Stinnerb960b342011-11-20 19:12:52 +01005701 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005702}
5703
Guido van Rossumd57fd912000-03-10 22:53:23 +00005704/* --- UTF-16 Codec ------------------------------------------------------- */
5705
Tim Peters772747b2001-08-09 22:21:55 +00005706PyObject *
5707PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005708 Py_ssize_t size,
5709 const char *errors,
5710 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005711{
Walter Dörwald69652032004-09-07 20:24:22 +00005712 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5713}
5714
5715PyObject *
5716PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005717 Py_ssize_t size,
5718 const char *errors,
5719 int *byteorder,
5720 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005721{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005722 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005723 Py_ssize_t startinpos;
5724 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005725 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005726 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005727 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005728 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005729 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005730 PyObject *errorHandler = NULL;
5731 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005732 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Tim Peters772747b2001-08-09 22:21:55 +00005734 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005735 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005736
5737 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005738 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005739
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005740 /* Check for BOM marks (U+FEFF) in the input and adjust current
5741 byte order setting accordingly. In native mode, the leading BOM
5742 mark is skipped, in all other modes, it is copied to the output
5743 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005744 if (bo == 0 && size >= 2) {
5745 const Py_UCS4 bom = (q[1] << 8) | q[0];
5746 if (bom == 0xFEFF) {
5747 q += 2;
5748 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005749 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005750 else if (bom == 0xFFFE) {
5751 q += 2;
5752 bo = 1;
5753 }
5754 if (byteorder)
5755 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005756 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005757
Antoine Pitrou63065d72012-05-15 23:48:04 +02005758 if (q == e) {
5759 if (consumed)
5760 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005761 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005762 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005763
Christian Heimes743e0cd2012-10-17 23:52:17 +02005764#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005765 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005766 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005767#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005768 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005770#endif
Tim Peters772747b2001-08-09 22:21:55 +00005771
Antoine Pitrou63065d72012-05-15 23:48:04 +02005772 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005773 character count normally. Error handler will take care of
5774 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005775 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005776 writer.min_length = (e - q + 1) / 2;
5777 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005778 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005779
Antoine Pitrou63065d72012-05-15 23:48:04 +02005780 while (1) {
5781 Py_UCS4 ch = 0;
5782 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005783 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005784 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005787 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005788 native_ordering);
5789 else
5790 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005791 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005792 native_ordering);
5793 } else if (kind == PyUnicode_2BYTE_KIND) {
5794 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005795 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005796 native_ordering);
5797 } else {
5798 assert(kind == PyUnicode_4BYTE_KIND);
5799 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005800 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005801 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005802 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005803 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005804
Antoine Pitrou63065d72012-05-15 23:48:04 +02005805 switch (ch)
5806 {
5807 case 0:
5808 /* remaining byte at the end? (size should be even) */
5809 if (q == e || consumed)
5810 goto End;
5811 errmsg = "truncated data";
5812 startinpos = ((const char *)q) - starts;
5813 endinpos = ((const char *)e) - starts;
5814 break;
5815 /* The remaining input chars are ignored if the callback
5816 chooses to skip the input */
5817 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005818 q -= 2;
5819 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005820 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005821 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005822 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005823 endinpos = ((const char *)e) - starts;
5824 break;
5825 case 2:
5826 errmsg = "illegal encoding";
5827 startinpos = ((const char *)q) - 2 - starts;
5828 endinpos = startinpos + 2;
5829 break;
5830 case 3:
5831 errmsg = "illegal UTF-16 surrogate";
5832 startinpos = ((const char *)q) - 4 - starts;
5833 endinpos = startinpos + 2;
5834 break;
5835 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005836 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005837 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005838 continue;
5839 }
5840
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005841 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005842 errors,
5843 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005845 &starts,
5846 (const char **)&e,
5847 &startinpos,
5848 &endinpos,
5849 &exc,
5850 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005852 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005853 }
5854
Antoine Pitrou63065d72012-05-15 23:48:04 +02005855End:
Walter Dörwald69652032004-09-07 20:24:22 +00005856 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005858
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005861 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005862
Benjamin Peterson29060642009-01-31 22:14:21 +00005863 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005864 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005865 Py_XDECREF(errorHandler);
5866 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005867 return NULL;
5868}
5869
Tim Peters772747b2001-08-09 22:21:55 +00005870PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871_PyUnicode_EncodeUTF16(PyObject *str,
5872 const char *errors,
5873 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005874{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005875 enum PyUnicode_Kind kind;
5876 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005877 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005878 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005879 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005880 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005881#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005882 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005883#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005884 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005885#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005886 const char *encoding;
5887 Py_ssize_t nsize, pos;
5888 PyObject *errorHandler = NULL;
5889 PyObject *exc = NULL;
5890 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 if (!PyUnicode_Check(str)) {
5893 PyErr_BadArgument();
5894 return NULL;
5895 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005896 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005897 return NULL;
5898 kind = PyUnicode_KIND(str);
5899 data = PyUnicode_DATA(str);
5900 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005901
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005903 if (kind == PyUnicode_4BYTE_KIND) {
5904 const Py_UCS4 *in = (const Py_UCS4 *)data;
5905 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005906 while (in < end) {
5907 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005908 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005909 }
5910 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005911 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005912 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005913 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005914 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005915 nsize = len + pairs + (byteorder == 0);
5916 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005917 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005918 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005919 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005920
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005921 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005922 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005923 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005924 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005925 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005926 }
5927 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005928 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005929 }
Tim Peters772747b2001-08-09 22:21:55 +00005930
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005931 if (kind == PyUnicode_1BYTE_KIND) {
5932 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5933 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005934 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005935
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005936 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005937 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005938 }
5939 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005940 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005941 }
5942 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005943 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005944 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005945
5946 pos = 0;
5947 while (pos < len) {
5948 Py_ssize_t repsize, moreunits;
5949
5950 if (kind == PyUnicode_2BYTE_KIND) {
5951 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5952 &out, native_ordering);
5953 }
5954 else {
5955 assert(kind == PyUnicode_4BYTE_KIND);
5956 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5957 &out, native_ordering);
5958 }
5959 if (pos == len)
5960 break;
5961
5962 rep = unicode_encode_call_errorhandler(
5963 errors, &errorHandler,
5964 encoding, "surrogates not allowed",
5965 str, &exc, pos, pos + 1, &pos);
5966 if (!rep)
5967 goto error;
5968
5969 if (PyBytes_Check(rep)) {
5970 repsize = PyBytes_GET_SIZE(rep);
5971 if (repsize & 1) {
5972 raise_encode_exception(&exc, encoding,
5973 str, pos - 1, pos,
5974 "surrogates not allowed");
5975 goto error;
5976 }
5977 moreunits = repsize / 2;
5978 }
5979 else {
5980 assert(PyUnicode_Check(rep));
5981 if (PyUnicode_READY(rep) < 0)
5982 goto error;
5983 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5984 if (!PyUnicode_IS_ASCII(rep)) {
5985 raise_encode_exception(&exc, encoding,
5986 str, pos - 1, pos,
5987 "surrogates not allowed");
5988 goto error;
5989 }
5990 }
5991
5992 /* two bytes are reserved for each surrogate */
5993 if (moreunits > 1) {
5994 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005995 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005996 /* integer overflow */
5997 PyErr_NoMemory();
5998 goto error;
5999 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006000 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 goto error;
6002 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6003 }
6004
6005 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006006 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006007 out += moreunits;
6008 } else /* rep is unicode */ {
6009 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6010 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6011 &out, native_ordering);
6012 }
6013
6014 Py_CLEAR(rep);
6015 }
6016
6017 /* Cut back to size actually needed. This is necessary for, for example,
6018 encoding of a string containing isolated surrogates and the 'ignore' handler
6019 is used. */
6020 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6021 if (nsize != PyBytes_GET_SIZE(v))
6022 _PyBytes_Resize(&v, nsize);
6023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006025 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006026 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006027 error:
6028 Py_XDECREF(rep);
6029 Py_XDECREF(errorHandler);
6030 Py_XDECREF(exc);
6031 Py_XDECREF(v);
6032 return NULL;
6033#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006034}
6035
Alexander Belopolsky40018472011-02-26 01:02:56 +00006036PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006037PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6038 Py_ssize_t size,
6039 const char *errors,
6040 int byteorder)
6041{
6042 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006043 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 if (tmp == NULL)
6045 return NULL;
6046 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6047 Py_DECREF(tmp);
6048 return result;
6049}
6050
6051PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006052PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006053{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006054 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006055}
6056
6057/* --- Unicode Escape Codec ----------------------------------------------- */
6058
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   for character-name lookups -- confirm against the code that assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006060
Alexander Belopolsky40018472011-02-26 01:02:56 +00006061PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006062_PyUnicode_DecodeUnicodeEscape(const char *s,
6063 Py_ssize_t size,
6064 const char *errors,
6065 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006066{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006067 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006068 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006069 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006070 PyObject *errorHandler = NULL;
6071 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006072
Eric V. Smith42454af2016-10-31 09:22:08 -04006073 // so we can remember if we've seen an invalid escape char or not
6074 *first_invalid_escape = NULL;
6075
Victor Stinner62ec3312016-09-06 17:04:34 -07006076 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006077 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006078 }
6079 /* Escaped strings will always be longer than the resulting
6080 Unicode string, so we start with size here and then reduce the
6081 length after conversion to the true value.
6082 (but if the error callback returns a long replacement string
6083 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006084 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006085 writer.min_length = size;
6086 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6087 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006088 }
6089
Guido van Rossumd57fd912000-03-10 22:53:23 +00006090 end = s + size;
6091 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006092 unsigned char c = (unsigned char) *s++;
6093 Py_UCS4 ch;
6094 int count;
6095 Py_ssize_t startinpos;
6096 Py_ssize_t endinpos;
6097 const char *message;
6098
6099#define WRITE_ASCII_CHAR(ch) \
6100 do { \
6101 assert(ch <= 127); \
6102 assert(writer.pos < writer.size); \
6103 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6104 } while(0)
6105
6106#define WRITE_CHAR(ch) \
6107 do { \
6108 if (ch <= writer.maxchar) { \
6109 assert(writer.pos < writer.size); \
6110 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6111 } \
6112 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6113 goto onError; \
6114 } \
6115 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006116
6117 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006118 if (c != '\\') {
6119 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006120 continue;
6121 }
6122
Victor Stinner62ec3312016-09-06 17:04:34 -07006123 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006124 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 if (s >= end) {
6126 message = "\\ at end of string";
6127 goto error;
6128 }
6129 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006130
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006132 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006133
Benjamin Peterson29060642009-01-31 22:14:21 +00006134 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 case '\n': continue;
6136 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6137 case '\'': WRITE_ASCII_CHAR('\''); continue;
6138 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6139 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006140 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006141 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6142 case 't': WRITE_ASCII_CHAR('\t'); continue;
6143 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6144 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006145 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006146 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006147 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006148 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149
Benjamin Peterson29060642009-01-31 22:14:21 +00006150 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006151 case '0': case '1': case '2': case '3':
6152 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006153 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006154 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006155 ch = (ch<<3) + *s++ - '0';
6156 if (s < end && '0' <= *s && *s <= '7') {
6157 ch = (ch<<3) + *s++ - '0';
6158 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 WRITE_CHAR(ch);
6161 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006162
Benjamin Peterson29060642009-01-31 22:14:21 +00006163 /* hex escapes */
6164 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006167 message = "truncated \\xXX escape";
6168 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006171 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006173 message = "truncated \\uXXXX escape";
6174 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006175
Benjamin Peterson29060642009-01-31 22:14:21 +00006176 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006177 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006178 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006179 message = "truncated \\UXXXXXXXX escape";
6180 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006181 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006182 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006183 ch <<= 4;
6184 if (c >= '0' && c <= '9') {
6185 ch += c - '0';
6186 }
6187 else if (c >= 'a' && c <= 'f') {
6188 ch += c - ('a' - 10);
6189 }
6190 else if (c >= 'A' && c <= 'F') {
6191 ch += c - ('A' - 10);
6192 }
6193 else {
6194 break;
6195 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006196 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006198 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006199 }
6200
6201 /* when we get here, ch is a 32-bit unicode character */
6202 if (ch > MAX_UNICODE) {
6203 message = "illegal Unicode character";
6204 goto error;
6205 }
6206
6207 WRITE_CHAR(ch);
6208 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006209
Benjamin Peterson29060642009-01-31 22:14:21 +00006210 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006211 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006212 if (ucnhash_CAPI == NULL) {
6213 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006214 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6215 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006216 if (ucnhash_CAPI == NULL) {
6217 PyErr_SetString(
6218 PyExc_UnicodeError,
6219 "\\N escapes not supported (can't load unicodedata module)"
6220 );
6221 goto onError;
6222 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006223 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006224
6225 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006226 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006227 const char *start = ++s;
6228 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006229 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006231 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006232 namelen = s - start;
6233 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006234 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006235 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006236 ch = 0xffffffff; /* in case 'getcode' messes up */
6237 if (namelen <= INT_MAX &&
6238 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6239 &ch, 0)) {
6240 assert(ch <= MAX_UNICODE);
6241 WRITE_CHAR(ch);
6242 continue;
6243 }
6244 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006245 }
6246 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006247 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006248
6249 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006250 if (*first_invalid_escape == NULL) {
6251 *first_invalid_escape = s-1; /* Back up one char, since we've
6252 already incremented s. */
6253 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006254 WRITE_ASCII_CHAR('\\');
6255 WRITE_CHAR(c);
6256 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006257 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006258
6259 error:
6260 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006261 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006262 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006263 errors, &errorHandler,
6264 "unicodeescape", message,
6265 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006266 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006267 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006268 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006269 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006270
6271#undef WRITE_ASCII_CHAR
6272#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006274
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006275 Py_XDECREF(errorHandler);
6276 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006277 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006278
Benjamin Peterson29060642009-01-31 22:14:21 +00006279 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006280 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006281 Py_XDECREF(errorHandler);
6282 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 return NULL;
6284}
6285
Eric V. Smith42454af2016-10-31 09:22:08 -04006286PyObject *
6287PyUnicode_DecodeUnicodeEscape(const char *s,
6288 Py_ssize_t size,
6289 const char *errors)
6290{
6291 const char *first_invalid_escape;
6292 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6293 &first_invalid_escape);
6294 if (result == NULL)
6295 return NULL;
6296 if (first_invalid_escape != NULL) {
6297 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6298 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006299 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006300 Py_DECREF(result);
6301 return NULL;
6302 }
6303 }
6304 return result;
6305}
6306
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006307/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
Alexander Belopolsky40018472011-02-26 01:02:56 +00006309PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006311{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006312 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006313 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006314 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006315 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006316 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006317 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006318
Ezio Melottie7f90372012-10-05 03:33:31 +03006319 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006320 escape.
6321
Ezio Melottie7f90372012-10-05 03:33:31 +03006322 For UCS1 strings it's '\xxx', 4 bytes per source character.
6323 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6324 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006325 */
6326
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006327 if (!PyUnicode_Check(unicode)) {
6328 PyErr_BadArgument();
6329 return NULL;
6330 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006331 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006332 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006333 }
Victor Stinner358af132015-10-12 22:36:57 +02006334
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006335 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 if (len == 0) {
6337 return PyBytes_FromStringAndSize(NULL, 0);
6338 }
6339
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006340 kind = PyUnicode_KIND(unicode);
6341 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6343 bytes, and 1 byte characters 4. */
6344 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006345 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006346 return PyErr_NoMemory();
6347 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006348 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006349 if (repr == NULL) {
6350 return NULL;
6351 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006352
Victor Stinner62ec3312016-09-06 17:04:34 -07006353 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006354 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006355 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006356
Victor Stinner62ec3312016-09-06 17:04:34 -07006357 /* U+0000-U+00ff range */
6358 if (ch < 0x100) {
6359 if (ch >= ' ' && ch < 127) {
6360 if (ch != '\\') {
6361 /* Copy printable US ASCII as-is */
6362 *p++ = (char) ch;
6363 }
6364 /* Escape backslashes */
6365 else {
6366 *p++ = '\\';
6367 *p++ = '\\';
6368 }
6369 }
Victor Stinner358af132015-10-12 22:36:57 +02006370
Victor Stinner62ec3312016-09-06 17:04:34 -07006371 /* Map special whitespace to '\t', \n', '\r' */
6372 else if (ch == '\t') {
6373 *p++ = '\\';
6374 *p++ = 't';
6375 }
6376 else if (ch == '\n') {
6377 *p++ = '\\';
6378 *p++ = 'n';
6379 }
6380 else if (ch == '\r') {
6381 *p++ = '\\';
6382 *p++ = 'r';
6383 }
6384
6385 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6386 else {
6387 *p++ = '\\';
6388 *p++ = 'x';
6389 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6390 *p++ = Py_hexdigits[ch & 0x000F];
6391 }
Tim Petersced69f82003-09-16 20:30:58 +00006392 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006393 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006394 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 *p++ = '\\';
6396 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006397 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6398 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6399 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6400 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006401 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6403 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006404
Victor Stinner62ec3312016-09-06 17:04:34 -07006405 /* Make sure that the first two digits are zero */
6406 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006407 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006408 *p++ = 'U';
6409 *p++ = '0';
6410 *p++ = '0';
6411 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6412 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6413 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6414 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6415 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6416 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006418 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419
Victor Stinner62ec3312016-09-06 17:04:34 -07006420 assert(p - PyBytes_AS_STRING(repr) > 0);
6421 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6422 return NULL;
6423 }
6424 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006425}
6426
Alexander Belopolsky40018472011-02-26 01:02:56 +00006427PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6429 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006431 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006432 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006433 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006435 }
6436
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006437 result = PyUnicode_AsUnicodeEscapeString(tmp);
6438 Py_DECREF(tmp);
6439 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006440}
6441
6442/* --- Raw Unicode Escape Codec ------------------------------------------- */
6443
Alexander Belopolsky40018472011-02-26 01:02:56 +00006444PyObject *
6445PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006446 Py_ssize_t size,
6447 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006448{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006449 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006450 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006451 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006452 PyObject *errorHandler = NULL;
6453 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006454
Victor Stinner62ec3312016-09-06 17:04:34 -07006455 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006456 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006457 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 /* Escaped strings will always be longer than the resulting
6460 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006461 length after conversion to the true value. (But decoding error
6462 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006463 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 writer.min_length = size;
6465 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6466 goto onError;
6467 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006468
Guido van Rossumd57fd912000-03-10 22:53:23 +00006469 end = s + size;
6470 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006471 unsigned char c = (unsigned char) *s++;
6472 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006473 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006474 Py_ssize_t startinpos;
6475 Py_ssize_t endinpos;
6476 const char *message;
6477
6478#define WRITE_CHAR(ch) \
6479 do { \
6480 if (ch <= writer.maxchar) { \
6481 assert(writer.pos < writer.size); \
6482 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6483 } \
6484 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6485 goto onError; \
6486 } \
6487 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006488
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 if (c != '\\' || s >= end) {
6491 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006492 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006493 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006494
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 c = (unsigned char) *s++;
6496 if (c == 'u') {
6497 count = 4;
6498 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006499 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 else if (c == 'U') {
6501 count = 8;
6502 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006503 }
6504 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006505 assert(writer.pos < writer.size);
6506 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6507 WRITE_CHAR(c);
6508 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006509 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006510 startinpos = s - starts - 2;
6511
6512 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6513 for (ch = 0; count && s < end; ++s, --count) {
6514 c = (unsigned char)*s;
6515 ch <<= 4;
6516 if (c >= '0' && c <= '9') {
6517 ch += c - '0';
6518 }
6519 else if (c >= 'a' && c <= 'f') {
6520 ch += c - ('a' - 10);
6521 }
6522 else if (c >= 'A' && c <= 'F') {
6523 ch += c - ('A' - 10);
6524 }
6525 else {
6526 break;
6527 }
6528 }
6529 if (!count) {
6530 if (ch <= MAX_UNICODE) {
6531 WRITE_CHAR(ch);
6532 continue;
6533 }
6534 message = "\\Uxxxxxxxx out of range";
6535 }
6536
6537 endinpos = s-starts;
6538 writer.min_length = end - s + writer.pos;
6539 if (unicode_decode_call_errorhandler_writer(
6540 errors, &errorHandler,
6541 "rawunicodeescape", message,
6542 &starts, &end, &startinpos, &endinpos, &exc, &s,
6543 &writer)) {
6544 goto onError;
6545 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006546 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006547
6548#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006549 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006552 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006553
Benjamin Peterson29060642009-01-31 22:14:21 +00006554 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006555 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006556 Py_XDECREF(errorHandler);
6557 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006558 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006559
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560}
6561
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006562
Alexander Belopolsky40018472011-02-26 01:02:56 +00006563PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006564PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006565{
Victor Stinner62ec3312016-09-06 17:04:34 -07006566 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006567 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006568 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006569 int kind;
6570 void *data;
6571 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006572
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006573 if (!PyUnicode_Check(unicode)) {
6574 PyErr_BadArgument();
6575 return NULL;
6576 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006578 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006579 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006580 kind = PyUnicode_KIND(unicode);
6581 data = PyUnicode_DATA(unicode);
6582 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006583 if (kind == PyUnicode_1BYTE_KIND) {
6584 return PyBytes_FromStringAndSize(data, len);
6585 }
Victor Stinner0e368262011-11-10 20:12:49 +01006586
Victor Stinner62ec3312016-09-06 17:04:34 -07006587 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6588 bytes, and 1 byte characters 4. */
6589 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006590
Victor Stinner62ec3312016-09-06 17:04:34 -07006591 if (len > PY_SSIZE_T_MAX / expandsize) {
6592 return PyErr_NoMemory();
6593 }
6594 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6595 if (repr == NULL) {
6596 return NULL;
6597 }
6598 if (len == 0) {
6599 return repr;
6600 }
6601
6602 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006603 for (pos = 0; pos < len; pos++) {
6604 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006605
Victor Stinner62ec3312016-09-06 17:04:34 -07006606 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6607 if (ch < 0x100) {
6608 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006609 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006610 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006611 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 *p++ = '\\';
6613 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006614 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6615 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6616 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6617 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006618 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006619 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6620 else {
6621 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6622 *p++ = '\\';
6623 *p++ = 'U';
6624 *p++ = '0';
6625 *p++ = '0';
6626 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6627 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6628 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6629 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6630 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6631 *p++ = Py_hexdigits[ch & 15];
6632 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006633 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006634
Victor Stinner62ec3312016-09-06 17:04:34 -07006635 assert(p > PyBytes_AS_STRING(repr));
6636 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6637 return NULL;
6638 }
6639 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006640}
6641
Alexander Belopolsky40018472011-02-26 01:02:56 +00006642PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006643PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6644 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006645{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006646 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006647 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006648 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006649 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006650 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6651 Py_DECREF(tmp);
6652 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006653}
6654
6655/* --- Latin-1 Codec ------------------------------------------------------ */
6656
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657PyObject *
6658PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 Py_ssize_t size,
6660 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006662 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006663 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006664}
6665
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006666/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006667static void
6668make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006669 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006670 PyObject *unicode,
6671 Py_ssize_t startpos, Py_ssize_t endpos,
6672 const char *reason)
6673{
6674 if (*exceptionObject == NULL) {
6675 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006676 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006677 encoding, unicode, startpos, endpos, reason);
6678 }
6679 else {
6680 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6681 goto onError;
6682 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6683 goto onError;
6684 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6685 goto onError;
6686 return;
6687 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006688 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006689 }
6690}
6691
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006692/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006693static void
6694raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006695 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006696 PyObject *unicode,
6697 Py_ssize_t startpos, Py_ssize_t endpos,
6698 const char *reason)
6699{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006700 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006701 encoding, unicode, startpos, endpos, reason);
6702 if (*exceptionObject != NULL)
6703 PyCodec_StrictErrors(*exceptionObject);
6704}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006705
6706/* error handling callback helper:
6707 build arguments, call the callback and check the arguments,
6708 put the result into newpos and return the replacement string, which
6709 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006710static PyObject *
6711unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006712 PyObject **errorHandler,
6713 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006714 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006715 Py_ssize_t startpos, Py_ssize_t endpos,
6716 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006718 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006719 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006720 PyObject *restuple;
6721 PyObject *resunicode;
6722
6723 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006724 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727 }
6728
Benjamin Petersonbac79492012-01-14 13:34:47 -05006729 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006730 return NULL;
6731 len = PyUnicode_GET_LENGTH(unicode);
6732
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006733 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006734 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006735 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006736 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006737
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006738 restuple = PyObject_CallFunctionObjArgs(
6739 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006741 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006742 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006743 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006744 Py_DECREF(restuple);
6745 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006746 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006747 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006748 &resunicode, newpos)) {
6749 Py_DECREF(restuple);
6750 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006752 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6753 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6754 Py_DECREF(restuple);
6755 return NULL;
6756 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006757 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006758 *newpos = len + *newpos;
6759 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006760 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006761 Py_DECREF(restuple);
6762 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006763 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006764 Py_INCREF(resunicode);
6765 Py_DECREF(restuple);
6766 return resunicode;
6767}
6768
Alexander Belopolsky40018472011-02-26 01:02:56 +00006769static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006770unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006771 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006772 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006774 /* input state */
6775 Py_ssize_t pos=0, size;
6776 int kind;
6777 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778 /* pointer into the output */
6779 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006780 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6781 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006782 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006783 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006784 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006785 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006786 /* output object */
6787 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006788
Benjamin Petersonbac79492012-01-14 13:34:47 -05006789 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006790 return NULL;
6791 size = PyUnicode_GET_LENGTH(unicode);
6792 kind = PyUnicode_KIND(unicode);
6793 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006794 /* allocate enough for a simple encoding without
6795 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006796 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006797 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006798
6799 _PyBytesWriter_Init(&writer);
6800 str = _PyBytesWriter_Alloc(&writer, size);
6801 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006802 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006803
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006804 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006805 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006806
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006808 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006810 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006811 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006812 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006813 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006814 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006816 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006817 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006818 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006819
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006820 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006821 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006822
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006823 /* Only overallocate the buffer if it's not the last write */
6824 writer.overallocate = (collend < size);
6825
Benjamin Peterson29060642009-01-31 22:14:21 +00006826 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006827 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006828 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006829
6830 switch (error_handler) {
6831 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006832 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006833 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006834
6835 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006836 memset(str, '?', collend - collstart);
6837 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006838 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006839 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006840 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006841 break;
Victor Stinner50149202015-09-22 00:26:54 +02006842
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006843 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006844 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006845 writer.min_size -= (collend - collstart);
6846 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006847 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006848 if (str == NULL)
6849 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006850 pos = collend;
6851 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006852
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006853 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006854 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006855 writer.min_size -= (collend - collstart);
6856 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006857 unicode, collstart, collend);
6858 if (str == NULL)
6859 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006860 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006861 break;
Victor Stinner50149202015-09-22 00:26:54 +02006862
Victor Stinnerc3713e92015-09-29 12:32:13 +02006863 case _Py_ERROR_SURROGATEESCAPE:
6864 for (i = collstart; i < collend; ++i) {
6865 ch = PyUnicode_READ(kind, data, i);
6866 if (ch < 0xdc80 || 0xdcff < ch) {
6867 /* Not a UTF-8b surrogate */
6868 break;
6869 }
6870 *str++ = (char)(ch - 0xdc00);
6871 ++pos;
6872 }
6873 if (i >= collend)
6874 break;
6875 collstart = pos;
6876 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006877 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006878
Benjamin Peterson29060642009-01-31 22:14:21 +00006879 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006880 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6881 encoding, reason, unicode, &exc,
6882 collstart, collend, &newpos);
6883 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006884 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006885
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006886 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006887 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006888
Victor Stinner6bd525b2015-10-09 13:10:05 +02006889 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006890 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006891 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006892 PyBytes_AS_STRING(rep),
6893 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006894 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006895 else {
6896 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006897
Victor Stinner6bd525b2015-10-09 13:10:05 +02006898 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006900
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006901 if (limit == 256 ?
6902 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6903 !PyUnicode_IS_ASCII(rep))
6904 {
6905 /* Not all characters are smaller than limit */
6906 raise_encode_exception(&exc, encoding, unicode,
6907 collstart, collend, reason);
6908 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006909 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006910 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6911 str = _PyBytesWriter_WriteBytes(&writer, str,
6912 PyUnicode_DATA(rep),
6913 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006914 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006915 if (str == NULL)
6916 goto onError;
6917
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006918 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006919 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006920 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006921
6922 /* If overallocation was disabled, ensure that it was the last
6923 write. Otherwise, we missed an optimization */
6924 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006925 }
6926 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006927
Victor Stinner50149202015-09-22 00:26:54 +02006928 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006929 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006930 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006931
6932 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006933 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006934 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006935 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006936 Py_XDECREF(exc);
6937 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006938}
6939
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006940/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006941PyObject *
6942PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006943 Py_ssize_t size,
6944 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006945{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006946 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006947 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006948 if (unicode == NULL)
6949 return NULL;
6950 result = unicode_encode_ucs1(unicode, errors, 256);
6951 Py_DECREF(unicode);
6952 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006953}
6954
Alexander Belopolsky40018472011-02-26 01:02:56 +00006955PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006956_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006957{
6958 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006959 PyErr_BadArgument();
6960 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006961 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006962 if (PyUnicode_READY(unicode) == -1)
6963 return NULL;
6964 /* Fast path: if it is a one-byte string, construct
6965 bytes object directly. */
6966 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6967 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6968 PyUnicode_GET_LENGTH(unicode));
6969 /* Non-Latin-1 characters present. Defer to above function to
6970 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006971 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006972}
6973
6974PyObject*
6975PyUnicode_AsLatin1String(PyObject *unicode)
6976{
6977 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006978}
6979
6980/* --- 7-bit ASCII Codec -------------------------------------------------- */
6981
Alexander Belopolsky40018472011-02-26 01:02:56 +00006982PyObject *
6983PyUnicode_DecodeASCII(const char *s,
6984 Py_ssize_t size,
6985 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006986{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006987 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006988 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006989 int kind;
6990 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006991 Py_ssize_t startinpos;
6992 Py_ssize_t endinpos;
6993 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006994 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006995 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006996 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006997 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006998
Guido van Rossumd57fd912000-03-10 22:53:23 +00006999 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007000 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007001
Guido van Rossumd57fd912000-03-10 22:53:23 +00007002 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007003 if (size == 1 && (unsigned char)s[0] < 128)
7004 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007005
Victor Stinner8f674cc2013-04-17 23:02:17 +02007006 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007007 writer.min_length = size;
7008 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02007009 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007010
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007011 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007013 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007014 writer.pos = outpos;
7015 if (writer.pos == size)
7016 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007017
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007018 s += writer.pos;
7019 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007020 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007021 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007022 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007023 PyUnicode_WRITE(kind, data, writer.pos, c);
7024 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007026 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007027 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007028
7029 /* byte outsize range 0x00..0x7f: call the error handler */
7030
7031 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007032 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007033
7034 switch (error_handler)
7035 {
7036 case _Py_ERROR_REPLACE:
7037 case _Py_ERROR_SURROGATEESCAPE:
7038 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007039 but we may switch to UCS2 at the first write */
7040 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7041 goto onError;
7042 kind = writer.kind;
7043 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007044
7045 if (error_handler == _Py_ERROR_REPLACE)
7046 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7047 else
7048 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7049 writer.pos++;
7050 ++s;
7051 break;
7052
7053 case _Py_ERROR_IGNORE:
7054 ++s;
7055 break;
7056
7057 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 startinpos = s-starts;
7059 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007060 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007061 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 "ascii", "ordinal not in range(128)",
7063 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007064 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007065 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007066 kind = writer.kind;
7067 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007069 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007070 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007072 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007073
Benjamin Peterson29060642009-01-31 22:14:21 +00007074 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007075 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007076 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007077 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007078 return NULL;
7079}
7080
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007082PyObject *
7083PyUnicode_EncodeASCII(const Py_UNICODE *p,
7084 Py_ssize_t size,
7085 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007086{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007087 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007088 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007089 if (unicode == NULL)
7090 return NULL;
7091 result = unicode_encode_ucs1(unicode, errors, 128);
7092 Py_DECREF(unicode);
7093 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007094}
7095
Alexander Belopolsky40018472011-02-26 01:02:56 +00007096PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007097_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007098{
7099 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007100 PyErr_BadArgument();
7101 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007102 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007103 if (PyUnicode_READY(unicode) == -1)
7104 return NULL;
7105 /* Fast path: if it is an ASCII-only string, construct bytes object
7106 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007107 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007108 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7109 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007110 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007111}
7112
7113PyObject *
7114PyUnicode_AsASCIIString(PyObject *unicode)
7115{
7116 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007117}
7118
Steve Dowercc16be82016-09-08 10:35:16 -07007119#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007120
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007121/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007122
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007123#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007124#define NEED_RETRY
7125#endif
7126
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7131#define DECODING_CHUNK_SIZE (INT_MAX/4)
7132
Victor Stinner3a50e702011-10-18 21:21:00 +02007133#ifndef WC_ERR_INVALID_CHARS
7134# define WC_ERR_INVALID_CHARS 0x0080
7135#endif
7136
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007137static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007138code_page_name(UINT code_page, PyObject **obj)
7139{
7140 *obj = NULL;
7141 if (code_page == CP_ACP)
7142 return "mbcs";
7143 if (code_page == CP_UTF7)
7144 return "CP_UTF7";
7145 if (code_page == CP_UTF8)
7146 return "CP_UTF8";
7147
7148 *obj = PyBytes_FromFormat("cp%u", code_page);
7149 if (*obj == NULL)
7150 return NULL;
7151 return PyBytes_AS_STRING(*obj);
7152}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153
Victor Stinner3a50e702011-10-18 21:21:00 +02007154static DWORD
7155decode_code_page_flags(UINT code_page)
7156{
7157 if (code_page == CP_UTF7) {
7158 /* The CP_UTF7 decoder only supports flags=0 */
7159 return 0;
7160 }
7161 else
7162 return MB_ERR_INVALID_CHARS;
7163}
7164
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007165/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007166 * Decode a byte string from a Windows code page into unicode object in strict
7167 * mode.
7168 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007169 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7170 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007171 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007172static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007173decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007174 wchar_t **buf,
7175 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007176 const char *in,
7177 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007178{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007179 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007180 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007181 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182
7183 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007184 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007185 while ((outsize = MultiByteToWideChar(code_page, flags,
7186 in, insize, NULL, 0)) <= 0)
7187 {
7188 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7189 goto error;
7190 }
7191 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7192 flags = 0;
7193 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007194
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007195 /* Extend a wchar_t* buffer */
7196 Py_ssize_t n = *bufsize; /* Get the current length */
7197 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7198 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007199 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007200 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201
7202 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007203 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7204 if (outsize <= 0)
7205 goto error;
7206 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007207
Victor Stinner3a50e702011-10-18 21:21:00 +02007208error:
7209 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7210 return -2;
7211 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007212 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007213}
7214
Victor Stinner3a50e702011-10-18 21:21:00 +02007215/*
7216 * Decode a byte string from a code page into unicode object with an error
7217 * handler.
7218 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007219 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 * UnicodeDecodeError exception and returns -1 on error.
7221 */
7222static int
7223decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007224 wchar_t **buf,
7225 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007226 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007227 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007228{
7229 const char *startin = in;
7230 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007231 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007232 /* Ideally, we should get reason from FormatMessage. This is the Windows
7233 2000 English version of the message. */
7234 const char *reason = "No mapping for the Unicode character exists "
7235 "in the target code page.";
7236 /* each step cannot decode more than 1 character, but a character can be
7237 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007238 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007239 int insize;
7240 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007241 PyObject *errorHandler = NULL;
7242 PyObject *exc = NULL;
7243 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007244 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007245 DWORD err;
7246 int ret = -1;
7247
7248 assert(size > 0);
7249
7250 encoding = code_page_name(code_page, &encoding_obj);
7251 if (encoding == NULL)
7252 return -1;
7253
Victor Stinner7d00cc12014-03-17 23:08:06 +01007254 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007255 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7256 UnicodeDecodeError. */
7257 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7258 if (exc != NULL) {
7259 PyCodec_StrictErrors(exc);
7260 Py_CLEAR(exc);
7261 }
7262 goto error;
7263 }
7264
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007265 /* Extend a wchar_t* buffer */
7266 Py_ssize_t n = *bufsize; /* Get the current length */
7267 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7268 PyErr_NoMemory();
7269 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007270 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007271 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7272 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007273 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007274 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007275
7276 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007277 while (in < endin)
7278 {
7279 /* Decode a character */
7280 insize = 1;
7281 do
7282 {
7283 outsize = MultiByteToWideChar(code_page, flags,
7284 in, insize,
7285 buffer, Py_ARRAY_LENGTH(buffer));
7286 if (outsize > 0)
7287 break;
7288 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007289 if (err == ERROR_INVALID_FLAGS && flags) {
7290 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7291 flags = 0;
7292 continue;
7293 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 if (err != ERROR_NO_UNICODE_TRANSLATION
7295 && err != ERROR_INSUFFICIENT_BUFFER)
7296 {
7297 PyErr_SetFromWindowsErr(0);
7298 goto error;
7299 }
7300 insize++;
7301 }
7302 /* 4=maximum length of a UTF-8 sequence */
7303 while (insize <= 4 && (in + insize) <= endin);
7304
7305 if (outsize <= 0) {
7306 Py_ssize_t startinpos, endinpos, outpos;
7307
Victor Stinner7d00cc12014-03-17 23:08:06 +01007308 /* last character in partial decode? */
7309 if (in + insize >= endin && !final)
7310 break;
7311
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 startinpos = in - startin;
7313 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007314 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007315 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007316 errors, &errorHandler,
7317 encoding, reason,
7318 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007319 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007320 {
7321 goto error;
7322 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007323 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007324 }
7325 else {
7326 in += insize;
7327 memcpy(out, buffer, outsize * sizeof(wchar_t));
7328 out += outsize;
7329 }
7330 }
7331
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007332 /* Shrink the buffer */
7333 assert(out - *buf <= *bufsize);
7334 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007335 /* (in - startin) <= size and size is an int */
7336 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007337
7338error:
7339 Py_XDECREF(encoding_obj);
7340 Py_XDECREF(errorHandler);
7341 Py_XDECREF(exc);
7342 return ret;
7343}
7344
Victor Stinner3a50e702011-10-18 21:21:00 +02007345static PyObject *
7346decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007347 const char *s, Py_ssize_t size,
7348 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007350 wchar_t *buf = NULL;
7351 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007352 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353
Victor Stinner3a50e702011-10-18 21:21:00 +02007354 if (code_page < 0) {
7355 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7356 return NULL;
7357 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007358 if (size < 0) {
7359 PyErr_BadInternalCall();
7360 return NULL;
7361 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007362
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007364 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365
Victor Stinner76a31a62011-11-04 00:05:13 +01007366 do
7367 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007368#ifdef NEED_RETRY
Miss Islington (bot)f93c15a2019-08-21 16:53:56 -07007369 if (size > DECODING_CHUNK_SIZE) {
7370 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007371 final = 0;
7372 done = 0;
7373 }
7374 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 {
7377 chunk_size = (int)size;
7378 final = (consumed == NULL);
7379 done = 1;
7380 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007383 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007384 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007385 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007386 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007387
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007388 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007389 s, chunk_size);
7390 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007391 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007392 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007393 errors, final);
7394 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007395
7396 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007397 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007398 return NULL;
7399 }
7400
7401 if (consumed)
7402 *consumed += converted;
7403
7404 s += converted;
7405 size -= converted;
7406 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007407
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007408 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7409 PyMem_Free(buf);
7410 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007411}
7412
Alexander Belopolsky40018472011-02-26 01:02:56 +00007413PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007414PyUnicode_DecodeCodePageStateful(int code_page,
7415 const char *s,
7416 Py_ssize_t size,
7417 const char *errors,
7418 Py_ssize_t *consumed)
7419{
7420 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7421}
7422
7423PyObject *
7424PyUnicode_DecodeMBCSStateful(const char *s,
7425 Py_ssize_t size,
7426 const char *errors,
7427 Py_ssize_t *consumed)
7428{
7429 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7430}
7431
7432PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007433PyUnicode_DecodeMBCS(const char *s,
7434 Py_ssize_t size,
7435 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007436{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007437 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7438}
7439
Victor Stinner3a50e702011-10-18 21:21:00 +02007440static DWORD
7441encode_code_page_flags(UINT code_page, const char *errors)
7442{
7443 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007444 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007445 }
7446 else if (code_page == CP_UTF7) {
7447 /* CP_UTF7 only supports flags=0 */
7448 return 0;
7449 }
7450 else {
7451 if (errors != NULL && strcmp(errors, "replace") == 0)
7452 return 0;
7453 else
7454 return WC_NO_BEST_FIT_CHARS;
7455 }
7456}
7457
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 * Encode a Unicode string to a Windows code page into a byte string in strict
7460 * mode.
7461 *
7462 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007463 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007465static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007466encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007467 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007468 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007469{
Victor Stinner554f3f02010-06-16 23:33:54 +00007470 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 BOOL *pusedDefaultChar = &usedDefaultChar;
7472 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007473 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007474 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007475 const DWORD flags = encode_code_page_flags(code_page, NULL);
7476 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007477 /* Create a substring so that we can get the UTF-16 representation
7478 of just the slice under consideration. */
7479 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007480
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007484 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007486 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007487
Victor Stinner2fc507f2011-11-04 20:06:39 +01007488 substring = PyUnicode_Substring(unicode, offset, offset+len);
7489 if (substring == NULL)
7490 return -1;
7491 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7492 if (p == NULL) {
7493 Py_DECREF(substring);
7494 return -1;
7495 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007496 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007497
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007498 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007500 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 NULL, 0,
7502 NULL, pusedDefaultChar);
7503 if (outsize <= 0)
7504 goto error;
7505 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007506 if (pusedDefaultChar && *pusedDefaultChar) {
7507 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007508 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007510
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007512 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007513 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007514 if (*outbytes == NULL) {
7515 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007516 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007517 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007519 }
7520 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007521 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 const Py_ssize_t n = PyBytes_Size(*outbytes);
7523 if (outsize > PY_SSIZE_T_MAX - n) {
7524 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007526 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007527 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007528 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7529 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007531 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007533 }
7534
7535 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007536 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007537 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007538 out, outsize,
7539 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007540 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 if (outsize <= 0)
7542 goto error;
7543 if (pusedDefaultChar && *pusedDefaultChar)
7544 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007545 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007546
Victor Stinner3a50e702011-10-18 21:21:00 +02007547error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007548 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007549 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7550 return -2;
7551 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007552 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007553}
7554
Victor Stinner3a50e702011-10-18 21:21:00 +02007555/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007556 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007557 * error handler.
7558 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007559 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007560 * -1 on other error.
7561 */
7562static int
7563encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007564 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007565 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007566{
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007568 Py_ssize_t pos = unicode_offset;
7569 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007570 /* Ideally, we should get reason from FormatMessage. This is the Windows
7571 2000 English version of the message. */
7572 const char *reason = "invalid character";
7573 /* 4=maximum length of a UTF-8 sequence */
7574 char buffer[4];
7575 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7576 Py_ssize_t outsize;
7577 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 PyObject *errorHandler = NULL;
7579 PyObject *exc = NULL;
7580 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007581 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007582 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007583 PyObject *rep;
7584 int ret = -1;
7585
7586 assert(insize > 0);
7587
7588 encoding = code_page_name(code_page, &encoding_obj);
7589 if (encoding == NULL)
7590 return -1;
7591
7592 if (errors == NULL || strcmp(errors, "strict") == 0) {
7593 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7594 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007595 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007596 if (exc != NULL) {
7597 PyCodec_StrictErrors(exc);
7598 Py_DECREF(exc);
7599 }
7600 Py_XDECREF(encoding_obj);
7601 return -1;
7602 }
7603
7604 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7605 pusedDefaultChar = &usedDefaultChar;
7606 else
7607 pusedDefaultChar = NULL;
7608
7609 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7610 PyErr_NoMemory();
7611 goto error;
7612 }
7613 outsize = insize * Py_ARRAY_LENGTH(buffer);
7614
7615 if (*outbytes == NULL) {
7616 /* Create string object */
7617 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7618 if (*outbytes == NULL)
7619 goto error;
7620 out = PyBytes_AS_STRING(*outbytes);
7621 }
7622 else {
7623 /* Extend string object */
7624 Py_ssize_t n = PyBytes_Size(*outbytes);
7625 if (n > PY_SSIZE_T_MAX - outsize) {
7626 PyErr_NoMemory();
7627 goto error;
7628 }
7629 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7630 goto error;
7631 out = PyBytes_AS_STRING(*outbytes) + n;
7632 }
7633
7634 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007637 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7638 wchar_t chars[2];
7639 int charsize;
7640 if (ch < 0x10000) {
7641 chars[0] = (wchar_t)ch;
7642 charsize = 1;
7643 }
7644 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007645 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7646 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007647 charsize = 2;
7648 }
7649
Victor Stinner3a50e702011-10-18 21:21:00 +02007650 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 buffer, Py_ARRAY_LENGTH(buffer),
7653 NULL, pusedDefaultChar);
7654 if (outsize > 0) {
7655 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7656 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007657 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007658 memcpy(out, buffer, outsize);
7659 out += outsize;
7660 continue;
7661 }
7662 }
7663 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7664 PyErr_SetFromWindowsErr(0);
7665 goto error;
7666 }
7667
Victor Stinner3a50e702011-10-18 21:21:00 +02007668 rep = unicode_encode_call_errorhandler(
7669 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007670 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007671 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007672 if (rep == NULL)
7673 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007674 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007675
7676 if (PyBytes_Check(rep)) {
7677 outsize = PyBytes_GET_SIZE(rep);
7678 if (outsize != 1) {
7679 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7680 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7681 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7682 Py_DECREF(rep);
7683 goto error;
7684 }
7685 out = PyBytes_AS_STRING(*outbytes) + offset;
7686 }
7687 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7688 out += outsize;
7689 }
7690 else {
7691 Py_ssize_t i;
7692 enum PyUnicode_Kind kind;
7693 void *data;
7694
Benjamin Petersonbac79492012-01-14 13:34:47 -05007695 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007696 Py_DECREF(rep);
7697 goto error;
7698 }
7699
7700 outsize = PyUnicode_GET_LENGTH(rep);
7701 if (outsize != 1) {
7702 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7703 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7704 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7705 Py_DECREF(rep);
7706 goto error;
7707 }
7708 out = PyBytes_AS_STRING(*outbytes) + offset;
7709 }
7710 kind = PyUnicode_KIND(rep);
7711 data = PyUnicode_DATA(rep);
7712 for (i=0; i < outsize; i++) {
7713 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7714 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007715 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007716 encoding, unicode,
7717 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007718 "unable to encode error handler result to ASCII");
7719 Py_DECREF(rep);
7720 goto error;
7721 }
7722 *out = (unsigned char)ch;
7723 out++;
7724 }
7725 }
7726 Py_DECREF(rep);
7727 }
7728 /* write a NUL byte */
7729 *out = 0;
7730 outsize = out - PyBytes_AS_STRING(*outbytes);
7731 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7732 if (_PyBytes_Resize(outbytes, outsize) < 0)
7733 goto error;
7734 ret = 0;
7735
7736error:
7737 Py_XDECREF(encoding_obj);
7738 Py_XDECREF(errorHandler);
7739 Py_XDECREF(exc);
7740 return ret;
7741}
7742
Victor Stinner3a50e702011-10-18 21:21:00 +02007743static PyObject *
7744encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007745 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 const char *errors)
7747{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007748 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007749 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007750 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007751 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007752
Victor Stinner29dacf22015-01-26 16:41:32 +01007753 if (!PyUnicode_Check(unicode)) {
7754 PyErr_BadArgument();
7755 return NULL;
7756 }
7757
Benjamin Petersonbac79492012-01-14 13:34:47 -05007758 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007759 return NULL;
7760 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007761
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 if (code_page < 0) {
7763 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7764 return NULL;
7765 }
7766
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 return PyBytes_FromStringAndSize(NULL, 0);
7769
Victor Stinner7581cef2011-11-03 22:32:33 +01007770 offset = 0;
7771 do
7772 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007773#ifdef NEED_RETRY
Miss Islington (bot)f93c15a2019-08-21 16:53:56 -07007774 if (len > DECODING_CHUNK_SIZE) {
7775 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007776 done = 0;
7777 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007779#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007780 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007781 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007782 done = 1;
7783 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007784
Victor Stinner76a31a62011-11-04 00:05:13 +01007785 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007786 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007787 errors);
7788 if (ret == -2)
7789 ret = encode_code_page_errors(code_page, &outbytes,
7790 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007791 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007792 if (ret < 0) {
7793 Py_XDECREF(outbytes);
7794 return NULL;
7795 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007796
Victor Stinner7581cef2011-11-03 22:32:33 +01007797 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007798 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007799 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007800
Victor Stinner3a50e702011-10-18 21:21:00 +02007801 return outbytes;
7802}
7803
7804PyObject *
7805PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7806 Py_ssize_t size,
7807 const char *errors)
7808{
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007810 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007811 if (unicode == NULL)
7812 return NULL;
7813 res = encode_code_page(CP_ACP, unicode, errors);
7814 Py_DECREF(unicode);
7815 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007816}
7817
7818PyObject *
7819PyUnicode_EncodeCodePage(int code_page,
7820 PyObject *unicode,
7821 const char *errors)
7822{
Victor Stinner7581cef2011-11-03 22:32:33 +01007823 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007824}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007825
Alexander Belopolsky40018472011-02-26 01:02:56 +00007826PyObject *
7827PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007828{
Victor Stinner7581cef2011-11-03 22:32:33 +01007829 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007830}
7831
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007832#undef NEED_RETRY
7833
Steve Dowercc16be82016-09-08 10:35:16 -07007834#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007835
Guido van Rossumd57fd912000-03-10 22:53:23 +00007836/* --- Character Mapping Codec -------------------------------------------- */
7837
Victor Stinnerfb161b12013-04-18 01:44:27 +02007838static int
7839charmap_decode_string(const char *s,
7840 Py_ssize_t size,
7841 PyObject *mapping,
7842 const char *errors,
7843 _PyUnicodeWriter *writer)
7844{
7845 const char *starts = s;
7846 const char *e;
7847 Py_ssize_t startinpos, endinpos;
7848 PyObject *errorHandler = NULL, *exc = NULL;
7849 Py_ssize_t maplen;
7850 enum PyUnicode_Kind mapkind;
7851 void *mapdata;
7852 Py_UCS4 x;
7853 unsigned char ch;
7854
7855 if (PyUnicode_READY(mapping) == -1)
7856 return -1;
7857
7858 maplen = PyUnicode_GET_LENGTH(mapping);
7859 mapdata = PyUnicode_DATA(mapping);
7860 mapkind = PyUnicode_KIND(mapping);
7861
7862 e = s + size;
7863
7864 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7865 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7866 * is disabled in encoding aliases, latin1 is preferred because
7867 * its implementation is faster. */
7868 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7869 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7870 Py_UCS4 maxchar = writer->maxchar;
7871
7872 assert (writer->kind == PyUnicode_1BYTE_KIND);
7873 while (s < e) {
7874 ch = *s;
7875 x = mapdata_ucs1[ch];
7876 if (x > maxchar) {
7877 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7878 goto onError;
7879 maxchar = writer->maxchar;
7880 outdata = (Py_UCS1 *)writer->data;
7881 }
7882 outdata[writer->pos] = x;
7883 writer->pos++;
7884 ++s;
7885 }
7886 return 0;
7887 }
7888
7889 while (s < e) {
7890 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7891 enum PyUnicode_Kind outkind = writer->kind;
7892 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7893 if (outkind == PyUnicode_1BYTE_KIND) {
7894 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7895 Py_UCS4 maxchar = writer->maxchar;
7896 while (s < e) {
7897 ch = *s;
7898 x = mapdata_ucs2[ch];
7899 if (x > maxchar)
7900 goto Error;
7901 outdata[writer->pos] = x;
7902 writer->pos++;
7903 ++s;
7904 }
7905 break;
7906 }
7907 else if (outkind == PyUnicode_2BYTE_KIND) {
7908 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7909 while (s < e) {
7910 ch = *s;
7911 x = mapdata_ucs2[ch];
7912 if (x == 0xFFFE)
7913 goto Error;
7914 outdata[writer->pos] = x;
7915 writer->pos++;
7916 ++s;
7917 }
7918 break;
7919 }
7920 }
7921 ch = *s;
7922
7923 if (ch < maplen)
7924 x = PyUnicode_READ(mapkind, mapdata, ch);
7925 else
7926 x = 0xfffe; /* invalid value */
7927Error:
7928 if (x == 0xfffe)
7929 {
7930 /* undefined mapping */
7931 startinpos = s-starts;
7932 endinpos = startinpos+1;
7933 if (unicode_decode_call_errorhandler_writer(
7934 errors, &errorHandler,
7935 "charmap", "character maps to <undefined>",
7936 &starts, &e, &startinpos, &endinpos, &exc, &s,
7937 writer)) {
7938 goto onError;
7939 }
7940 continue;
7941 }
7942
7943 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7944 goto onError;
7945 ++s;
7946 }
7947 Py_XDECREF(errorHandler);
7948 Py_XDECREF(exc);
7949 return 0;
7950
7951onError:
7952 Py_XDECREF(errorHandler);
7953 Py_XDECREF(exc);
7954 return -1;
7955}
7956
7957static int
7958charmap_decode_mapping(const char *s,
7959 Py_ssize_t size,
7960 PyObject *mapping,
7961 const char *errors,
7962 _PyUnicodeWriter *writer)
7963{
7964 const char *starts = s;
7965 const char *e;
7966 Py_ssize_t startinpos, endinpos;
7967 PyObject *errorHandler = NULL, *exc = NULL;
7968 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007969 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007970
7971 e = s + size;
7972
7973 while (s < e) {
7974 ch = *s;
7975
7976 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7977 key = PyLong_FromLong((long)ch);
7978 if (key == NULL)
7979 goto onError;
7980
7981 item = PyObject_GetItem(mapping, key);
7982 Py_DECREF(key);
7983 if (item == NULL) {
7984 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7985 /* No mapping found means: mapping is undefined. */
7986 PyErr_Clear();
7987 goto Undefined;
7988 } else
7989 goto onError;
7990 }
7991
7992 /* Apply mapping */
7993 if (item == Py_None)
7994 goto Undefined;
7995 if (PyLong_Check(item)) {
7996 long value = PyLong_AS_LONG(item);
7997 if (value == 0xFFFE)
7998 goto Undefined;
7999 if (value < 0 || value > MAX_UNICODE) {
8000 PyErr_Format(PyExc_TypeError,
8001 "character mapping must be in range(0x%lx)",
8002 (unsigned long)MAX_UNICODE + 1);
8003 goto onError;
8004 }
8005
8006 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8007 goto onError;
8008 }
8009 else if (PyUnicode_Check(item)) {
8010 if (PyUnicode_READY(item) == -1)
8011 goto onError;
8012 if (PyUnicode_GET_LENGTH(item) == 1) {
8013 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8014 if (value == 0xFFFE)
8015 goto Undefined;
8016 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8017 goto onError;
8018 }
8019 else {
8020 writer->overallocate = 1;
8021 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8022 goto onError;
8023 }
8024 }
8025 else {
8026 /* wrong return value */
8027 PyErr_SetString(PyExc_TypeError,
8028 "character mapping must return integer, None or str");
8029 goto onError;
8030 }
8031 Py_CLEAR(item);
8032 ++s;
8033 continue;
8034
8035Undefined:
8036 /* undefined mapping */
8037 Py_CLEAR(item);
8038 startinpos = s-starts;
8039 endinpos = startinpos+1;
8040 if (unicode_decode_call_errorhandler_writer(
8041 errors, &errorHandler,
8042 "charmap", "character maps to <undefined>",
8043 &starts, &e, &startinpos, &endinpos, &exc, &s,
8044 writer)) {
8045 goto onError;
8046 }
8047 }
8048 Py_XDECREF(errorHandler);
8049 Py_XDECREF(exc);
8050 return 0;
8051
8052onError:
8053 Py_XDECREF(item);
8054 Py_XDECREF(errorHandler);
8055 Py_XDECREF(exc);
8056 return -1;
8057}
8058
Alexander Belopolsky40018472011-02-26 01:02:56 +00008059PyObject *
8060PyUnicode_DecodeCharmap(const char *s,
8061 Py_ssize_t size,
8062 PyObject *mapping,
8063 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008065 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008066
Guido van Rossumd57fd912000-03-10 22:53:23 +00008067 /* Default to Latin-1 */
8068 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008069 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008072 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008073 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008074 writer.min_length = size;
8075 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008077
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008078 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008079 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8080 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008081 }
8082 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008083 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8084 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008085 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008086 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008087
Benjamin Peterson29060642009-01-31 22:14:21 +00008088 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008089 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008090 return NULL;
8091}
8092
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008093/* Charmap encoding: the lookup table */
8094
Alexander Belopolsky40018472011-02-26 01:02:56 +00008095struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008096 PyObject_HEAD
8097 unsigned char level1[32];
8098 int count2, count3;
8099 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100};
8101
8102static PyObject*
8103encoding_map_size(PyObject *obj, PyObject* args)
8104{
8105 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008106 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008107 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008108}
8109
8110static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008111 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 PyDoc_STR("Return the size (in bytes) of this object") },
8113 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008114};
8115
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008116static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008117 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 "EncodingMap", /*tp_name*/
8119 sizeof(struct encoding_map), /*tp_basicsize*/
8120 0, /*tp_itemsize*/
8121 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008122 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008123 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008124 0, /*tp_getattr*/
8125 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008126 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008127 0, /*tp_repr*/
8128 0, /*tp_as_number*/
8129 0, /*tp_as_sequence*/
8130 0, /*tp_as_mapping*/
8131 0, /*tp_hash*/
8132 0, /*tp_call*/
8133 0, /*tp_str*/
8134 0, /*tp_getattro*/
8135 0, /*tp_setattro*/
8136 0, /*tp_as_buffer*/
8137 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8138 0, /*tp_doc*/
8139 0, /*tp_traverse*/
8140 0, /*tp_clear*/
8141 0, /*tp_richcompare*/
8142 0, /*tp_weaklistoffset*/
8143 0, /*tp_iter*/
8144 0, /*tp_iternext*/
8145 encoding_map_methods, /*tp_methods*/
8146 0, /*tp_members*/
8147 0, /*tp_getset*/
8148 0, /*tp_base*/
8149 0, /*tp_dict*/
8150 0, /*tp_descr_get*/
8151 0, /*tp_descr_set*/
8152 0, /*tp_dictoffset*/
8153 0, /*tp_init*/
8154 0, /*tp_alloc*/
8155 0, /*tp_new*/
8156 0, /*tp_free*/
8157 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008158};
8159
8160PyObject*
8161PyUnicode_BuildEncodingMap(PyObject* string)
8162{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008163 PyObject *result;
8164 struct encoding_map *mresult;
8165 int i;
8166 int need_dict = 0;
8167 unsigned char level1[32];
8168 unsigned char level2[512];
8169 unsigned char *mlevel1, *mlevel2, *mlevel3;
8170 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008171 int kind;
8172 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008173 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008174 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008175
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008176 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008177 PyErr_BadArgument();
8178 return NULL;
8179 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 kind = PyUnicode_KIND(string);
8181 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008182 length = PyUnicode_GET_LENGTH(string);
8183 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008184 memset(level1, 0xFF, sizeof level1);
8185 memset(level2, 0xFF, sizeof level2);
8186
8187 /* If there isn't a one-to-one mapping of NULL to \0,
8188 or if there are non-BMP characters, we need to use
8189 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008190 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008191 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008192 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008193 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194 ch = PyUnicode_READ(kind, data, i);
8195 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196 need_dict = 1;
8197 break;
8198 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008199 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 /* unmapped character */
8201 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008202 l1 = ch >> 11;
8203 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008204 if (level1[l1] == 0xFF)
8205 level1[l1] = count2++;
8206 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008207 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008208 }
8209
8210 if (count2 >= 0xFF || count3 >= 0xFF)
8211 need_dict = 1;
8212
8213 if (need_dict) {
8214 PyObject *result = PyDict_New();
8215 PyObject *key, *value;
8216 if (!result)
8217 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008218 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008219 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008220 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008221 if (!key || !value)
8222 goto failed1;
8223 if (PyDict_SetItem(result, key, value) == -1)
8224 goto failed1;
8225 Py_DECREF(key);
8226 Py_DECREF(value);
8227 }
8228 return result;
8229 failed1:
8230 Py_XDECREF(key);
8231 Py_XDECREF(value);
8232 Py_DECREF(result);
8233 return NULL;
8234 }
8235
8236 /* Create a three-level trie */
8237 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8238 16*count2 + 128*count3 - 1);
8239 if (!result)
8240 return PyErr_NoMemory();
8241 PyObject_Init(result, &EncodingMapType);
8242 mresult = (struct encoding_map*)result;
8243 mresult->count2 = count2;
8244 mresult->count3 = count3;
8245 mlevel1 = mresult->level1;
8246 mlevel2 = mresult->level23;
8247 mlevel3 = mresult->level23 + 16*count2;
8248 memcpy(mlevel1, level1, 32);
8249 memset(mlevel2, 0xFF, 16*count2);
8250 memset(mlevel3, 0, 128*count3);
8251 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008252 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008253 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008254 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8255 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008256 /* unmapped character */
8257 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008258 o1 = ch>>11;
8259 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008260 i2 = 16*mlevel1[o1] + o2;
8261 if (mlevel2[i2] == 0xFF)
8262 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008263 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264 i3 = 128*mlevel2[i2] + o3;
8265 mlevel3[i3] = i;
8266 }
8267 return result;
8268}
8269
8270static int
Victor Stinner22168992011-11-20 17:09:18 +01008271encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008272{
8273 struct encoding_map *map = (struct encoding_map*)mapping;
8274 int l1 = c>>11;
8275 int l2 = (c>>7) & 0xF;
8276 int l3 = c & 0x7F;
8277 int i;
8278
Victor Stinner22168992011-11-20 17:09:18 +01008279 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008280 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008281 if (c == 0)
8282 return 0;
8283 /* level 1*/
8284 i = map->level1[l1];
8285 if (i == 0xFF) {
8286 return -1;
8287 }
8288 /* level 2*/
8289 i = map->level23[16*i+l2];
8290 if (i == 0xFF) {
8291 return -1;
8292 }
8293 /* level 3 */
8294 i = map->level23[16*map->count2 + 128*i + l3];
8295 if (i == 0) {
8296 return -1;
8297 }
8298 return i;
8299}
8300
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008301/* Lookup the character ch in the mapping. If the character
8302 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008303 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008304static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008305charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008306{
Christian Heimes217cfd12007-12-02 14:31:20 +00008307 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008308 PyObject *x;
8309
8310 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008312 x = PyObject_GetItem(mapping, w);
8313 Py_DECREF(w);
8314 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8316 /* No mapping found means: mapping is undefined. */
8317 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008318 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 } else
8320 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008321 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008322 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008323 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008324 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 long value = PyLong_AS_LONG(x);
8326 if (value < 0 || value > 255) {
8327 PyErr_SetString(PyExc_TypeError,
8328 "character mapping must be in range(256)");
8329 Py_DECREF(x);
8330 return NULL;
8331 }
8332 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008334 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008335 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008336 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008337 /* wrong return value */
8338 PyErr_Format(PyExc_TypeError,
8339 "character mapping must return integer, bytes or None, not %.400s",
8340 x->ob_type->tp_name);
8341 Py_DECREF(x);
8342 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008343 }
8344}
8345
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008347charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008349 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8350 /* exponentially overallocate to minimize reallocations */
8351 if (requiredsize < 2*outsize)
8352 requiredsize = 2*outsize;
8353 if (_PyBytes_Resize(outobj, requiredsize))
8354 return -1;
8355 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008356}
8357
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008361/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008362 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008363 space is available. Return a new reference to the object that
8364 was put in the output buffer, or Py_None, if the mapping was undefined
8365 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008366 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008367static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008368charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008369 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008370{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008371 PyObject *rep;
8372 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008373 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008374
Christian Heimes90aa7642007-12-19 02:45:37 +00008375 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008377 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 if (res == -1)
8379 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 if (outsize<requiredsize)
8381 if (charmapencode_resize(outobj, outpos, requiredsize))
8382 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008383 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 outstart[(*outpos)++] = (char)res;
8385 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386 }
8387
8388 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008389 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008391 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008392 Py_DECREF(rep);
8393 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008394 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 if (PyLong_Check(rep)) {
8396 Py_ssize_t requiredsize = *outpos+1;
8397 if (outsize<requiredsize)
8398 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8399 Py_DECREF(rep);
8400 return enc_EXCEPTION;
8401 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008402 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008403 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008404 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 else {
8406 const char *repchars = PyBytes_AS_STRING(rep);
8407 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8408 Py_ssize_t requiredsize = *outpos+repsize;
8409 if (outsize<requiredsize)
8410 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8411 Py_DECREF(rep);
8412 return enc_EXCEPTION;
8413 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008414 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008415 memcpy(outstart + *outpos, repchars, repsize);
8416 *outpos += repsize;
8417 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008419 Py_DECREF(rep);
8420 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421}
8422
8423/* handle an error in PyUnicode_EncodeCharmap
8424 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008425static int
8426charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008427 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008429 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008430 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008431{
8432 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008433 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008434 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008435 enum PyUnicode_Kind kind;
8436 void *data;
8437 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008438 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008439 Py_ssize_t collstartpos = *inpos;
8440 Py_ssize_t collendpos = *inpos+1;
8441 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008442 const char *encoding = "charmap";
8443 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008444 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008445 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008446 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008447
Benjamin Petersonbac79492012-01-14 13:34:47 -05008448 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 return -1;
8450 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008451 /* find all unencodable characters */
8452 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008453 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008454 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008455 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008456 val = encoding_map_lookup(ch, mapping);
8457 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008458 break;
8459 ++collendpos;
8460 continue;
8461 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008462
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008463 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8464 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008465 if (rep==NULL)
8466 return -1;
8467 else if (rep!=Py_None) {
8468 Py_DECREF(rep);
8469 break;
8470 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008471 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008472 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 }
8474 /* cache callback name lookup
8475 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008476 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008477 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008478
8479 switch (*error_handler) {
8480 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008481 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008482 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008483
8484 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008485 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 x = charmapencode_output('?', mapping, res, respos);
8487 if (x==enc_EXCEPTION) {
8488 return -1;
8489 }
8490 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008491 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008492 return -1;
8493 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008494 }
8495 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008496 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008497 *inpos = collendpos;
8498 break;
Victor Stinner50149202015-09-22 00:26:54 +02008499
8500 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008501 /* generate replacement (temporarily (mis)uses p) */
8502 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008503 char buffer[2+29+1+1];
8504 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008505 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 for (cp = buffer; *cp; ++cp) {
8507 x = charmapencode_output(*cp, mapping, res, respos);
8508 if (x==enc_EXCEPTION)
8509 return -1;
8510 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008511 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 return -1;
8513 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 }
8515 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008516 *inpos = collendpos;
8517 break;
Victor Stinner50149202015-09-22 00:26:54 +02008518
Benjamin Peterson14339b62009-01-31 16:36:08 +00008519 default:
Victor Stinner50149202015-09-22 00:26:54 +02008520 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008521 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008522 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008523 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008524 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008525 if (PyBytes_Check(repunicode)) {
8526 /* Directly copy bytes result to output. */
8527 Py_ssize_t outsize = PyBytes_Size(*res);
8528 Py_ssize_t requiredsize;
8529 repsize = PyBytes_Size(repunicode);
8530 requiredsize = *respos + repsize;
8531 if (requiredsize > outsize)
8532 /* Make room for all additional bytes. */
8533 if (charmapencode_resize(res, respos, requiredsize)) {
8534 Py_DECREF(repunicode);
8535 return -1;
8536 }
8537 memcpy(PyBytes_AsString(*res) + *respos,
8538 PyBytes_AsString(repunicode), repsize);
8539 *respos += repsize;
8540 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008541 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008542 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008543 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008544 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008545 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008546 Py_DECREF(repunicode);
8547 return -1;
8548 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008549 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008550 data = PyUnicode_DATA(repunicode);
8551 kind = PyUnicode_KIND(repunicode);
8552 for (index = 0; index < repsize; index++) {
8553 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8554 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008555 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008556 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008557 return -1;
8558 }
8559 else if (x==enc_FAILED) {
8560 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008561 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008562 return -1;
8563 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008564 }
8565 *inpos = newpos;
8566 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008567 }
8568 return 0;
8569}
8570
Alexander Belopolsky40018472011-02-26 01:02:56 +00008571PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572_PyUnicode_EncodeCharmap(PyObject *unicode,
8573 PyObject *mapping,
8574 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008575{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 /* output object */
8577 PyObject *res = NULL;
8578 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008579 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008581 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008582 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008583 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008584 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008585 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008586 void *data;
8587 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Benjamin Petersonbac79492012-01-14 13:34:47 -05008589 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008590 return NULL;
8591 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008592 data = PyUnicode_DATA(unicode);
8593 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008594
Guido van Rossumd57fd912000-03-10 22:53:23 +00008595 /* Default to Latin-1 */
8596 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008597 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 /* allocate enough for a simple encoding without
8600 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008601 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008602 if (res == NULL)
8603 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008604 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008606
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008607 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008608 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008610 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 if (x==enc_EXCEPTION) /* error */
8612 goto onError;
8613 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008614 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008616 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008617 &res, &respos)) {
8618 goto onError;
8619 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008620 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008621 else
8622 /* done with this character => adjust input position */
8623 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008624 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008625
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008627 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008628 if (_PyBytes_Resize(&res, respos) < 0)
8629 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008630
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008631 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008632 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008633 return res;
8634
Benjamin Peterson29060642009-01-31 22:14:21 +00008635 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008636 Py_XDECREF(res);
8637 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008638 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008639 return NULL;
8640}
8641
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008642/* Deprecated */
8643PyObject *
8644PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8645 Py_ssize_t size,
8646 PyObject *mapping,
8647 const char *errors)
8648{
8649 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008650 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008651 if (unicode == NULL)
8652 return NULL;
8653 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8654 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008655 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008656}
8657
Alexander Belopolsky40018472011-02-26 01:02:56 +00008658PyObject *
8659PyUnicode_AsCharmapString(PyObject *unicode,
8660 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661{
8662 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008663 PyErr_BadArgument();
8664 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008666 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667}
8668
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008669/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008670static void
8671make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008672 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008673 Py_ssize_t startpos, Py_ssize_t endpos,
8674 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008675{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008676 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008677 *exceptionObject = _PyUnicodeTranslateError_Create(
8678 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008679 }
8680 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008681 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8682 goto onError;
8683 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8684 goto onError;
8685 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8686 goto onError;
8687 return;
8688 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008689 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008690 }
8691}
8692
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008693/* error handling callback helper:
8694 build arguments, call the callback and check the arguments,
8695 put the result into newpos and return the replacement string, which
8696 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008697static PyObject *
8698unicode_translate_call_errorhandler(const char *errors,
8699 PyObject **errorHandler,
8700 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008701 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008702 Py_ssize_t startpos, Py_ssize_t endpos,
8703 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008704{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008705 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008706
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008707 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008708 PyObject *restuple;
8709 PyObject *resunicode;
8710
8711 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715 }
8716
8717 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008718 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008719 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008721
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008722 restuple = PyObject_CallFunctionObjArgs(
8723 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008726 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008727 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008728 Py_DECREF(restuple);
8729 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008730 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008731 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 &resunicode, &i_newpos)) {
8733 Py_DECREF(restuple);
8734 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008736 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008737 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008738 else
8739 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008740 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008741 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008742 Py_DECREF(restuple);
8743 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008744 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745 Py_INCREF(resunicode);
8746 Py_DECREF(restuple);
8747 return resunicode;
8748}
8749
8750/* Lookup the character ch in the mapping and put the result in result,
8751 which must be decrefed by the caller.
8752 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008753static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008754charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755{
Christian Heimes217cfd12007-12-02 14:31:20 +00008756 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008757 PyObject *x;
8758
8759 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008760 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761 x = PyObject_GetItem(mapping, w);
8762 Py_DECREF(w);
8763 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008764 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8765 /* No mapping found means: use 1:1 mapping. */
8766 PyErr_Clear();
8767 *result = NULL;
8768 return 0;
8769 } else
8770 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008771 }
8772 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 *result = x;
8774 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008775 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008776 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008777 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008778 if (value < 0 || value > MAX_UNICODE) {
8779 PyErr_Format(PyExc_ValueError,
8780 "character mapping must be in range(0x%x)",
8781 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008782 Py_DECREF(x);
8783 return -1;
8784 }
8785 *result = x;
8786 return 0;
8787 }
8788 else if (PyUnicode_Check(x)) {
8789 *result = x;
8790 return 0;
8791 }
8792 else {
8793 /* wrong return value */
8794 PyErr_SetString(PyExc_TypeError,
8795 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008796 Py_DECREF(x);
8797 return -1;
8798 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008799}
Victor Stinner1194ea02014-04-04 19:37:40 +02008800
8801/* lookup the character, write the result into the writer.
8802 Return 1 if the result was written into the writer, return 0 if the mapping
8803 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008804static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008805charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8806 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008807{
Victor Stinner1194ea02014-04-04 19:37:40 +02008808 PyObject *item;
8809
8810 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008812
8813 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008814 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008815 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008816 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008817 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008818 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008819 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008820
8821 if (item == Py_None) {
8822 Py_DECREF(item);
8823 return 0;
8824 }
8825
8826 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008827 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8828 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8829 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008830 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8831 Py_DECREF(item);
8832 return -1;
8833 }
8834 Py_DECREF(item);
8835 return 1;
8836 }
8837
8838 if (!PyUnicode_Check(item)) {
8839 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008840 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008841 }
8842
8843 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8844 Py_DECREF(item);
8845 return -1;
8846 }
8847
8848 Py_DECREF(item);
8849 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008850}
8851
Victor Stinner89a76ab2014-04-05 11:44:04 +02008852static int
8853unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8854 Py_UCS1 *translate)
8855{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008856 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008857 int ret = 0;
8858
Victor Stinner89a76ab2014-04-05 11:44:04 +02008859 if (charmaptranslate_lookup(ch, mapping, &item)) {
8860 return -1;
8861 }
8862
8863 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008864 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008865 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008866 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008867 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008868 /* not found => default to 1:1 mapping */
8869 translate[ch] = ch;
8870 return 1;
8871 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008872 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008873 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008874 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8875 used it */
8876 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008877 /* invalid character or character outside ASCII:
8878 skip the fast translate */
8879 goto exit;
8880 }
8881 translate[ch] = (Py_UCS1)replace;
8882 }
8883 else if (PyUnicode_Check(item)) {
8884 Py_UCS4 replace;
8885
8886 if (PyUnicode_READY(item) == -1) {
8887 Py_DECREF(item);
8888 return -1;
8889 }
8890 if (PyUnicode_GET_LENGTH(item) != 1)
8891 goto exit;
8892
8893 replace = PyUnicode_READ_CHAR(item, 0);
8894 if (replace > 127)
8895 goto exit;
8896 translate[ch] = (Py_UCS1)replace;
8897 }
8898 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008899 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 goto exit;
8901 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008902 ret = 1;
8903
Benjamin Peterson1365de72014-04-07 20:15:41 -04008904 exit:
8905 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906 return ret;
8907}
8908
8909/* Fast path for ascii => ascii translation. Return 1 if the whole string
8910 was translated into writer, return 0 if the input string was partially
8911 translated into writer, raise an exception and return -1 on error. */
8912static int
8913unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008914 _PyUnicodeWriter *writer, int ignore,
8915 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916{
Victor Stinner872b2912014-04-05 14:27:07 +02008917 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008918 Py_ssize_t len;
8919 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008920 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008921
Victor Stinner89a76ab2014-04-05 11:44:04 +02008922 len = PyUnicode_GET_LENGTH(input);
8923
Victor Stinner872b2912014-04-05 14:27:07 +02008924 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008925
8926 in = PyUnicode_1BYTE_DATA(input);
8927 end = in + len;
8928
8929 assert(PyUnicode_IS_ASCII(writer->buffer));
8930 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8931 out = PyUnicode_1BYTE_DATA(writer->buffer);
8932
Victor Stinner872b2912014-04-05 14:27:07 +02008933 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008935 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008937 int translate = unicode_fast_translate_lookup(mapping, ch,
8938 ascii_table);
8939 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008940 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008941 if (translate == 0)
8942 goto exit;
8943 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008944 }
Victor Stinner872b2912014-04-05 14:27:07 +02008945 if (ch2 == 0xfe) {
8946 if (ignore)
8947 continue;
8948 goto exit;
8949 }
8950 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008951 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008952 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008953 }
Victor Stinner872b2912014-04-05 14:27:07 +02008954 res = 1;
8955
8956exit:
8957 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008958 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008959 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008960}
8961
Victor Stinner3222da22015-10-01 22:07:32 +02008962static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963_PyUnicode_TranslateCharmap(PyObject *input,
8964 PyObject *mapping,
8965 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008966{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008967 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008968 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008969 Py_ssize_t size, i;
8970 int kind;
8971 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008972 _PyUnicodeWriter writer;
8973 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008974 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008975 PyObject *errorHandler = NULL;
8976 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008977 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008978 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008979
Guido van Rossumd57fd912000-03-10 22:53:23 +00008980 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008981 PyErr_BadArgument();
8982 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008983 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008984
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985 if (PyUnicode_READY(input) == -1)
8986 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008987 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008988 kind = PyUnicode_KIND(input);
8989 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008990
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008991 if (size == 0)
8992 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008993
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008994 /* allocate enough for a simple 1:1 translation without
8995 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008996 _PyUnicodeWriter_Init(&writer);
8997 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008998 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008999
Victor Stinner872b2912014-04-05 14:27:07 +02009000 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9001
Victor Stinner33798672016-03-01 21:59:58 +01009002 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009003 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009004 if (PyUnicode_IS_ASCII(input)) {
9005 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9006 if (res < 0) {
9007 _PyUnicodeWriter_Dealloc(&writer);
9008 return NULL;
9009 }
9010 if (res == 1)
9011 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009012 }
Victor Stinner33798672016-03-01 21:59:58 +01009013 else {
9014 i = 0;
9015 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009016
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009017 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009018 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009019 int translate;
9020 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9021 Py_ssize_t newpos;
9022 /* startpos for collecting untranslatable chars */
9023 Py_ssize_t collstart;
9024 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009025 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009026
Victor Stinner1194ea02014-04-04 19:37:40 +02009027 ch = PyUnicode_READ(kind, data, i);
9028 translate = charmaptranslate_output(ch, mapping, &writer);
9029 if (translate < 0)
9030 goto onError;
9031
9032 if (translate != 0) {
9033 /* it worked => adjust input pointer */
9034 ++i;
9035 continue;
9036 }
9037
9038 /* untranslatable character */
9039 collstart = i;
9040 collend = i+1;
9041
9042 /* find all untranslatable characters */
9043 while (collend < size) {
9044 PyObject *x;
9045 ch = PyUnicode_READ(kind, data, collend);
9046 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009047 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009048 Py_XDECREF(x);
9049 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009050 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009051 ++collend;
9052 }
9053
9054 if (ignore) {
9055 i = collend;
9056 }
9057 else {
9058 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9059 reason, input, &exc,
9060 collstart, collend, &newpos);
9061 if (repunicode == NULL)
9062 goto onError;
9063 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009064 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009065 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009066 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009067 Py_DECREF(repunicode);
9068 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009069 }
9070 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009071 Py_XDECREF(exc);
9072 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009073 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009074
Benjamin Peterson29060642009-01-31 22:14:21 +00009075 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009076 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009077 Py_XDECREF(exc);
9078 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009079 return NULL;
9080}
9081
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009082/* Deprecated. Use PyUnicode_Translate instead. */
9083PyObject *
9084PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9085 Py_ssize_t size,
9086 PyObject *mapping,
9087 const char *errors)
9088{
Christian Heimes5f520f42012-09-11 14:03:25 +02009089 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009090 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009091 if (!unicode)
9092 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009093 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9094 Py_DECREF(unicode);
9095 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009096}
9097
Alexander Belopolsky40018472011-02-26 01:02:56 +00009098PyObject *
9099PyUnicode_Translate(PyObject *str,
9100 PyObject *mapping,
9101 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009102{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009103 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009104 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009105 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009106}
Tim Petersced69f82003-09-16 20:30:58 +00009107
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108PyObject *
9109_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9110{
9111 if (!PyUnicode_Check(unicode)) {
9112 PyErr_BadInternalCall();
9113 return NULL;
9114 }
9115 if (PyUnicode_READY(unicode) == -1)
9116 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009117 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009118 /* If the string is already ASCII, just return the same string */
9119 Py_INCREF(unicode);
9120 return unicode;
9121 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009122
9123 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9124 PyObject *result = PyUnicode_New(len, 127);
9125 if (result == NULL) {
9126 return NULL;
9127 }
9128
9129 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9130 int kind = PyUnicode_KIND(unicode);
9131 const void *data = PyUnicode_DATA(unicode);
9132 Py_ssize_t i;
9133 for (i = 0; i < len; ++i) {
9134 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9135 if (ch < 127) {
9136 out[i] = ch;
9137 }
9138 else if (Py_UNICODE_ISSPACE(ch)) {
9139 out[i] = ' ';
9140 }
9141 else {
9142 int decimal = Py_UNICODE_TODECIMAL(ch);
9143 if (decimal < 0) {
9144 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009145 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009146 _PyUnicode_LENGTH(result) = i + 1;
9147 break;
9148 }
9149 out[i] = '0' + decimal;
9150 }
9151 }
9152
INADA Naoki16dfca42018-07-14 12:06:43 +09009153 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009154 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009155}
9156
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009157PyObject *
9158PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9159 Py_ssize_t length)
9160{
Victor Stinnerf0124502011-11-21 23:12:56 +01009161 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009162 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009163 Py_UCS4 maxchar;
9164 enum PyUnicode_Kind kind;
9165 void *data;
9166
Victor Stinner99d7ad02012-02-22 13:37:39 +01009167 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009168 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009169 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009170 if (ch > 127) {
9171 int decimal = Py_UNICODE_TODECIMAL(ch);
9172 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009173 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009174 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009175 }
9176 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009177
9178 /* Copy to a new string */
9179 decimal = PyUnicode_New(length, maxchar);
9180 if (decimal == NULL)
9181 return decimal;
9182 kind = PyUnicode_KIND(decimal);
9183 data = PyUnicode_DATA(decimal);
9184 /* Iterate over code points */
9185 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009186 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009187 if (ch > 127) {
9188 int decimal = Py_UNICODE_TODECIMAL(ch);
9189 if (decimal >= 0)
9190 ch = '0' + decimal;
9191 }
9192 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009193 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009194 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009195}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009196/* --- Decimal Encoder ---------------------------------------------------- */
9197
Alexander Belopolsky40018472011-02-26 01:02:56 +00009198int
9199PyUnicode_EncodeDecimal(Py_UNICODE *s,
9200 Py_ssize_t length,
9201 char *output,
9202 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009203{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009204 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009205 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009206 enum PyUnicode_Kind kind;
9207 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009208
9209 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009210 PyErr_BadArgument();
9211 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009212 }
9213
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009214 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009215 if (unicode == NULL)
9216 return -1;
9217
Victor Stinner42bf7752011-11-21 22:52:58 +01009218 kind = PyUnicode_KIND(unicode);
9219 data = PyUnicode_DATA(unicode);
9220
Victor Stinnerb84d7232011-11-22 01:50:07 +01009221 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009222 PyObject *exc;
9223 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009225 Py_ssize_t startpos;
9226
9227 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009228
Benjamin Peterson29060642009-01-31 22:14:21 +00009229 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009230 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009231 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009233 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009234 decimal = Py_UNICODE_TODECIMAL(ch);
9235 if (decimal >= 0) {
9236 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009237 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009238 continue;
9239 }
9240 if (0 < ch && ch < 256) {
9241 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009242 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009243 continue;
9244 }
Victor Stinner6345be92011-11-25 20:09:01 +01009245
Victor Stinner42bf7752011-11-21 22:52:58 +01009246 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009247 exc = NULL;
9248 raise_encode_exception(&exc, "decimal", unicode,
9249 startpos, startpos+1,
9250 "invalid decimal Unicode string");
9251 Py_XDECREF(exc);
9252 Py_DECREF(unicode);
9253 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009254 }
9255 /* 0-terminate the output string */
9256 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009257 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009258 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009259}
9260
Guido van Rossumd57fd912000-03-10 22:53:23 +00009261/* --- Helpers ------------------------------------------------------------ */
9262
/* helper macro to fixup start/end slice values

   Clamps `end` to at most `len`; a negative `start` or `end` is taken
   relative to `len` and clamped to 0 if it is still negative afterwards.
   `start` and `end` must be modifiable lvalues and are updated in place;
   every argument may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and is safe inside an unbraced if/else.  Use it as a
   statement followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9277
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009278static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009279any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009280 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009281 Py_ssize_t end,
9282 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009284 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 void *buf1, *buf2;
9286 Py_ssize_t len1, len2, result;
9287
9288 kind1 = PyUnicode_KIND(s1);
9289 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009290 if (kind1 < kind2)
9291 return -1;
9292
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009293 len1 = PyUnicode_GET_LENGTH(s1);
9294 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009295 ADJUST_INDICES(start, end, len1);
9296 if (end - start < len2)
9297 return -1;
9298
9299 buf1 = PyUnicode_DATA(s1);
9300 buf2 = PyUnicode_DATA(s2);
9301 if (len2 == 1) {
9302 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9303 result = findchar((const char *)buf1 + kind1*start,
9304 kind1, end - start, ch, direction);
9305 if (result == -1)
9306 return -1;
9307 else
9308 return start + result;
9309 }
9310
9311 if (kind2 != kind1) {
9312 buf2 = _PyUnicode_AsKind(s2, kind1);
9313 if (!buf2)
9314 return -2;
9315 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009316
Victor Stinner794d5672011-10-10 03:21:36 +02009317 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009318 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009319 case PyUnicode_1BYTE_KIND:
9320 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9321 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9322 else
9323 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9324 break;
9325 case PyUnicode_2BYTE_KIND:
9326 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9327 break;
9328 case PyUnicode_4BYTE_KIND:
9329 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9330 break;
9331 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009332 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009333 }
9334 }
9335 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009336 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009337 case PyUnicode_1BYTE_KIND:
9338 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9339 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340 else
9341 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342 break;
9343 case PyUnicode_2BYTE_KIND:
9344 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9345 break;
9346 case PyUnicode_4BYTE_KIND:
9347 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9348 break;
9349 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009350 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009351 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009352 }
9353
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009354 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009355 PyMem_Free(buf2);
9356
9357 return result;
9358}
9359
Victor Stinner59423e32018-11-26 13:40:01 +01009360/* _PyUnicode_InsertThousandsGrouping() helper functions */
9361#include "stringlib/localeutil.h"
9362
9363/**
9364 * InsertThousandsGrouping:
9365 * @writer: Unicode writer.
9366 * @n_buffer: Number of characters in @buffer.
9367 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9368 * @d_pos: Start of digits string.
9369 * @n_digits: The number of digits in the string, in which we want
9370 * to put the grouping chars.
9371 * @min_width: The minimum width of the digits in the output string.
9372 * Output will be zero-padded on the left to fill.
9373 * @grouping: see definition in localeconv().
9374 * @thousands_sep: see definition in localeconv().
9375 *
9376 * There are 2 modes: counting and filling. If @writer is NULL,
9377 * we are in counting mode, else filling mode.
9378 * If counting, the required buffer size is returned.
9379 * If filling, we know the buffer will be large enough, so we don't
9380 * need to pass in the buffer size.
9381 * Inserts thousand grouping characters (as defined by grouping and
9382 * thousands_sep) into @writer.
9383 *
9384 * Return value: -1 on error, number of characters otherwise.
9385 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009386Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009387_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009388 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009389 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009390 PyObject *digits,
9391 Py_ssize_t d_pos,
9392 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009393 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009394 const char *grouping,
9395 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009397{
Xtreak3f7983a2019-01-07 20:39:14 +05309398 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009399 if (writer) {
9400 assert(digits != NULL);
9401 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009402 }
9403 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009404 assert(digits == NULL);
9405 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009406 }
Victor Stinner59423e32018-11-26 13:40:01 +01009407 assert(0 <= d_pos);
9408 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009409 assert(grouping != NULL);
9410
9411 if (digits != NULL) {
9412 if (PyUnicode_READY(digits) == -1) {
9413 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009414 }
Victor Stinner59423e32018-11-26 13:40:01 +01009415 }
9416 if (PyUnicode_READY(thousands_sep) == -1) {
9417 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009418 }
9419
Victor Stinner59423e32018-11-26 13:40:01 +01009420 Py_ssize_t count = 0;
9421 Py_ssize_t n_zeros;
9422 int loop_broken = 0;
9423 int use_separator = 0; /* First time through, don't append the
9424 separator. They only go between
9425 groups. */
9426 Py_ssize_t buffer_pos;
9427 Py_ssize_t digits_pos;
9428 Py_ssize_t len;
9429 Py_ssize_t n_chars;
9430 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9431 be looked at */
9432 /* A generator that returns all of the grouping widths, until it
9433 returns 0. */
9434 GroupGenerator groupgen;
9435 GroupGenerator_init(&groupgen, grouping);
9436 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9437
9438 /* if digits are not grouped, thousands separator
9439 should be an empty string */
9440 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9441
9442 digits_pos = d_pos + n_digits;
9443 if (writer) {
9444 buffer_pos = writer->pos + n_buffer;
9445 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9446 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009447 }
Victor Stinner59423e32018-11-26 13:40:01 +01009448 else {
9449 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009450 }
Victor Stinner59423e32018-11-26 13:40:01 +01009451
9452 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009453 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009454 }
Victor Stinner59423e32018-11-26 13:40:01 +01009455
9456 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9457 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9458 n_zeros = Py_MAX(0, len - remaining);
9459 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9460
9461 /* Use n_zero zero's and n_chars chars */
9462
9463 /* Count only, don't do anything. */
9464 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9465
9466 /* Copy into the writer. */
9467 InsertThousandsGrouping_fill(writer, &buffer_pos,
9468 digits, &digits_pos,
9469 n_chars, n_zeros,
9470 use_separator ? thousands_sep : NULL,
9471 thousands_sep_len, maxchar);
9472
9473 /* Use a separator next time. */
9474 use_separator = 1;
9475
9476 remaining -= n_chars;
9477 min_width -= len;
9478
9479 if (remaining <= 0 && min_width <= 0) {
9480 loop_broken = 1;
9481 break;
9482 }
9483 min_width -= thousands_sep_len;
9484 }
9485 if (!loop_broken) {
9486 /* We left the loop without using a break statement. */
9487
9488 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9489 n_zeros = Py_MAX(0, len - remaining);
9490 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9491
9492 /* Use n_zero zero's and n_chars chars */
9493 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9494
9495 /* Copy into the writer. */
9496 InsertThousandsGrouping_fill(writer, &buffer_pos,
9497 digits, &digits_pos,
9498 n_chars, n_zeros,
9499 use_separator ? thousands_sep : NULL,
9500 thousands_sep_len, maxchar);
9501 }
9502 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009503}
9504
9505
Alexander Belopolsky40018472011-02-26 01:02:56 +00009506Py_ssize_t
9507PyUnicode_Count(PyObject *str,
9508 PyObject *substr,
9509 Py_ssize_t start,
9510 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009511{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009512 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009513 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009514 void *buf1 = NULL, *buf2 = NULL;
9515 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009518 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009519
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009520 kind1 = PyUnicode_KIND(str);
9521 kind2 = PyUnicode_KIND(substr);
9522 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009523 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009524
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009525 len1 = PyUnicode_GET_LENGTH(str);
9526 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009527 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009528 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009529 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009530
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009531 buf1 = PyUnicode_DATA(str);
9532 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009533 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009534 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009535 if (!buf2)
9536 goto onError;
9537 }
9538
9539 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009540 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009541 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009542 result = asciilib_count(
9543 ((Py_UCS1*)buf1) + start, end - start,
9544 buf2, len2, PY_SSIZE_T_MAX
9545 );
9546 else
9547 result = ucs1lib_count(
9548 ((Py_UCS1*)buf1) + start, end - start,
9549 buf2, len2, PY_SSIZE_T_MAX
9550 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009551 break;
9552 case PyUnicode_2BYTE_KIND:
9553 result = ucs2lib_count(
9554 ((Py_UCS2*)buf1) + start, end - start,
9555 buf2, len2, PY_SSIZE_T_MAX
9556 );
9557 break;
9558 case PyUnicode_4BYTE_KIND:
9559 result = ucs4lib_count(
9560 ((Py_UCS4*)buf1) + start, end - start,
9561 buf2, len2, PY_SSIZE_T_MAX
9562 );
9563 break;
9564 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009565 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009567
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009568 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009569 PyMem_Free(buf2);
9570
Guido van Rossumd57fd912000-03-10 22:53:23 +00009571 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009572 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009573 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009574 PyMem_Free(buf2);
9575 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576}
9577
Alexander Belopolsky40018472011-02-26 01:02:56 +00009578Py_ssize_t
9579PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009580 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009581 Py_ssize_t start,
9582 Py_ssize_t end,
9583 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009584{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009585 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009586 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009587
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009588 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009589}
9590
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009591Py_ssize_t
9592PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9593 Py_ssize_t start, Py_ssize_t end,
9594 int direction)
9595{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009597 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 if (PyUnicode_READY(str) == -1)
9599 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009600 len = PyUnicode_GET_LENGTH(str);
9601 ADJUST_INDICES(start, end, len);
9602 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009603 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009604 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009605 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9606 kind, end-start, ch, direction);
9607 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009608 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009609 else
9610 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009611}
9612
Alexander Belopolsky40018472011-02-26 01:02:56 +00009613static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009614tailmatch(PyObject *self,
9615 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009616 Py_ssize_t start,
9617 Py_ssize_t end,
9618 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009619{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009620 int kind_self;
9621 int kind_sub;
9622 void *data_self;
9623 void *data_sub;
9624 Py_ssize_t offset;
9625 Py_ssize_t i;
9626 Py_ssize_t end_sub;
9627
9628 if (PyUnicode_READY(self) == -1 ||
9629 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009630 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9633 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009634 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009635 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009636
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009637 if (PyUnicode_GET_LENGTH(substring) == 0)
9638 return 1;
9639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009640 kind_self = PyUnicode_KIND(self);
9641 data_self = PyUnicode_DATA(self);
9642 kind_sub = PyUnicode_KIND(substring);
9643 data_sub = PyUnicode_DATA(substring);
9644 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9645
9646 if (direction > 0)
9647 offset = end;
9648 else
9649 offset = start;
9650
9651 if (PyUnicode_READ(kind_self, data_self, offset) ==
9652 PyUnicode_READ(kind_sub, data_sub, 0) &&
9653 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9654 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9655 /* If both are of the same kind, memcmp is sufficient */
9656 if (kind_self == kind_sub) {
9657 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009658 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009659 data_sub,
9660 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009661 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009663 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 else {
9665 /* We do not need to compare 0 and len(substring)-1 because
9666 the if statement above ensured already that they are equal
9667 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 for (i = 1; i < end_sub; ++i) {
9669 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9670 PyUnicode_READ(kind_sub, data_sub, i))
9671 return 0;
9672 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009673 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009674 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009675 }
9676
9677 return 0;
9678}
9679
Alexander Belopolsky40018472011-02-26 01:02:56 +00009680Py_ssize_t
9681PyUnicode_Tailmatch(PyObject *str,
9682 PyObject *substr,
9683 Py_ssize_t start,
9684 Py_ssize_t end,
9685 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009686{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009687 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009688 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009689
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009690 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009691}
9692
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009693static PyObject *
9694ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009695{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009696 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9697 char *resdata, *data = PyUnicode_DATA(self);
9698 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009699
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009700 res = PyUnicode_New(len, 127);
9701 if (res == NULL)
9702 return NULL;
9703 resdata = PyUnicode_DATA(res);
9704 if (lower)
9705 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009706 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 _Py_bytes_upper(resdata, data, len);
9708 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009709}
9710
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009711static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009712handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009713{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009714 Py_ssize_t j;
9715 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009716 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009717 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009718
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009719 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9720
9721 where ! is a negation and \p{xxx} is a character with property xxx.
9722 */
9723 for (j = i - 1; j >= 0; j--) {
9724 c = PyUnicode_READ(kind, data, j);
9725 if (!_PyUnicode_IsCaseIgnorable(c))
9726 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009727 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009728 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9729 if (final_sigma) {
9730 for (j = i + 1; j < length; j++) {
9731 c = PyUnicode_READ(kind, data, j);
9732 if (!_PyUnicode_IsCaseIgnorable(c))
9733 break;
9734 }
9735 final_sigma = j == length || !_PyUnicode_IsCased(c);
9736 }
9737 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738}
9739
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009740static int
9741lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9742 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744 /* Obscure special case. */
9745 if (c == 0x3A3) {
9746 mapped[0] = handle_capital_sigma(kind, data, length, i);
9747 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009750}
9751
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009752static Py_ssize_t
9753do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 Py_ssize_t i, k = 0;
9756 int n_res, j;
9757 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009758
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009760 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009761 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009762 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009765 for (i = 1; i < length; i++) {
9766 c = PyUnicode_READ(kind, data, i);
9767 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9768 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009769 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009771 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009772 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009773 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009774}
9775
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009776static Py_ssize_t
9777do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9778 Py_ssize_t i, k = 0;
9779
9780 for (i = 0; i < length; i++) {
9781 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9782 int n_res, j;
9783 if (Py_UNICODE_ISUPPER(c)) {
9784 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9785 }
9786 else if (Py_UNICODE_ISLOWER(c)) {
9787 n_res = _PyUnicode_ToUpperFull(c, mapped);
9788 }
9789 else {
9790 n_res = 1;
9791 mapped[0] = c;
9792 }
9793 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009794 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795 res[k++] = mapped[j];
9796 }
9797 }
9798 return k;
9799}
9800
9801static Py_ssize_t
9802do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9803 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009805 Py_ssize_t i, k = 0;
9806
9807 for (i = 0; i < length; i++) {
9808 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9809 int n_res, j;
9810 if (lower)
9811 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9812 else
9813 n_res = _PyUnicode_ToUpperFull(c, mapped);
9814 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009815 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 res[k++] = mapped[j];
9817 }
9818 }
9819 return k;
9820}
9821
9822static Py_ssize_t
9823do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9824{
9825 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9826}
9827
9828static Py_ssize_t
9829do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9830{
9831 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9832}
9833
Benjamin Petersone51757f2012-01-12 21:10:29 -05009834static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009835do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9836{
9837 Py_ssize_t i, k = 0;
9838
9839 for (i = 0; i < length; i++) {
9840 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9841 Py_UCS4 mapped[3];
9842 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9843 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009844 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009845 res[k++] = mapped[j];
9846 }
9847 }
9848 return k;
9849}
9850
9851static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009852do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9853{
9854 Py_ssize_t i, k = 0;
9855 int previous_is_cased;
9856
9857 previous_is_cased = 0;
9858 for (i = 0; i < length; i++) {
9859 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9860 Py_UCS4 mapped[3];
9861 int n_res, j;
9862
9863 if (previous_is_cased)
9864 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9865 else
9866 n_res = _PyUnicode_ToTitleFull(c, mapped);
9867
9868 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009869 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009870 res[k++] = mapped[j];
9871 }
9872
9873 previous_is_cased = _PyUnicode_IsCased(c);
9874 }
9875 return k;
9876}
9877
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009878static PyObject *
9879case_operation(PyObject *self,
9880 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9881{
9882 PyObject *res = NULL;
9883 Py_ssize_t length, newlength = 0;
9884 int kind, outkind;
9885 void *data, *outdata;
9886 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9887
Benjamin Petersoneea48462012-01-16 14:28:50 -05009888 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009889
9890 kind = PyUnicode_KIND(self);
9891 data = PyUnicode_DATA(self);
9892 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009893 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009894 PyErr_SetString(PyExc_OverflowError, "string is too long");
9895 return NULL;
9896 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009897 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009898 if (tmp == NULL)
9899 return PyErr_NoMemory();
9900 newlength = perform(kind, data, length, tmp, &maxchar);
9901 res = PyUnicode_New(newlength, maxchar);
9902 if (res == NULL)
9903 goto leave;
9904 tmpend = tmp + newlength;
9905 outdata = PyUnicode_DATA(res);
9906 outkind = PyUnicode_KIND(res);
9907 switch (outkind) {
9908 case PyUnicode_1BYTE_KIND:
9909 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9910 break;
9911 case PyUnicode_2BYTE_KIND:
9912 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9913 break;
9914 case PyUnicode_4BYTE_KIND:
9915 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9916 break;
9917 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009918 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009919 }
9920 leave:
9921 PyMem_FREE(tmp);
9922 return res;
9923}
9924
Tim Peters8ce9f162004-08-27 01:49:32 +00009925PyObject *
9926PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009927{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009928 PyObject *res;
9929 PyObject *fseq;
9930 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009931 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009932
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009933 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009934 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009935 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009936 }
9937
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009938 /* NOTE: the following code can't call back into Python code,
9939 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009940 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009941
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009942 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009943 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009944 res = _PyUnicode_JoinArray(separator, items, seqlen);
9945 Py_DECREF(fseq);
9946 return res;
9947}
9948
9949PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009950_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009951{
9952 PyObject *res = NULL; /* the result */
9953 PyObject *sep = NULL;
9954 Py_ssize_t seplen;
9955 PyObject *item;
9956 Py_ssize_t sz, i, res_offset;
9957 Py_UCS4 maxchar;
9958 Py_UCS4 item_maxchar;
9959 int use_memcpy;
9960 unsigned char *res_data = NULL, *sep_data = NULL;
9961 PyObject *last_obj;
9962 unsigned int kind = 0;
9963
Tim Peters05eba1f2004-08-27 21:32:02 +00009964 /* If empty sequence, return u"". */
9965 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009966 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009967 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009968
Tim Peters05eba1f2004-08-27 21:32:02 +00009969 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009970 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009971 if (seqlen == 1) {
9972 if (PyUnicode_CheckExact(items[0])) {
9973 res = items[0];
9974 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009975 return res;
9976 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009977 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009978 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009979 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009980 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009981 /* Set up sep and seplen */
9982 if (separator == NULL) {
9983 /* fall back to a blank space separator */
9984 sep = PyUnicode_FromOrdinal(' ');
9985 if (!sep)
9986 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009987 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009988 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009989 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009990 else {
9991 if (!PyUnicode_Check(separator)) {
9992 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009993 "separator: expected str instance,"
9994 " %.80s found",
9995 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009996 goto onError;
9997 }
9998 if (PyUnicode_READY(separator))
9999 goto onError;
10000 sep = separator;
10001 seplen = PyUnicode_GET_LENGTH(separator);
10002 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10003 /* inc refcount to keep this code path symmetric with the
10004 above case of a blank separator */
10005 Py_INCREF(sep);
10006 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010007 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010008 }
10009
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010010 /* There are at least two things to join, or else we have a subclass
10011 * of str in the sequence.
10012 * Do a pre-pass to figure out the total amount of space we'll
10013 * need (sz), and see whether all argument are strings.
10014 */
10015 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010016#ifdef Py_DEBUG
10017 use_memcpy = 0;
10018#else
10019 use_memcpy = 1;
10020#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010021 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010022 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010023 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010024 if (!PyUnicode_Check(item)) {
10025 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010026 "sequence item %zd: expected str instance,"
10027 " %.80s found",
10028 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010029 goto onError;
10030 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010031 if (PyUnicode_READY(item) == -1)
10032 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010033 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010034 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010035 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010036 if (i != 0) {
10037 add_sz += seplen;
10038 }
10039 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010040 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010041 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010042 goto onError;
10043 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010044 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010045 if (use_memcpy && last_obj != NULL) {
10046 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10047 use_memcpy = 0;
10048 }
10049 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010050 }
Tim Petersced69f82003-09-16 20:30:58 +000010051
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010052 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010053 if (res == NULL)
10054 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010055
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010056 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010057#ifdef Py_DEBUG
10058 use_memcpy = 0;
10059#else
10060 if (use_memcpy) {
10061 res_data = PyUnicode_1BYTE_DATA(res);
10062 kind = PyUnicode_KIND(res);
10063 if (seplen != 0)
10064 sep_data = PyUnicode_1BYTE_DATA(sep);
10065 }
10066#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010067 if (use_memcpy) {
10068 for (i = 0; i < seqlen; ++i) {
10069 Py_ssize_t itemlen;
10070 item = items[i];
10071
10072 /* Copy item, and maybe the separator. */
10073 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010074 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010076 kind * seplen);
10077 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010078 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010079
10080 itemlen = PyUnicode_GET_LENGTH(item);
10081 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010082 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010083 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010084 kind * itemlen);
10085 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010086 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010087 }
10088 assert(res_data == PyUnicode_1BYTE_DATA(res)
10089 + kind * PyUnicode_GET_LENGTH(res));
10090 }
10091 else {
10092 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10093 Py_ssize_t itemlen;
10094 item = items[i];
10095
10096 /* Copy item, and maybe the separator. */
10097 if (i && seplen != 0) {
10098 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10099 res_offset += seplen;
10100 }
10101
10102 itemlen = PyUnicode_GET_LENGTH(item);
10103 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010104 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010105 res_offset += itemlen;
10106 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010107 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010108 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010109 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010110
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010111 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010112 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010113 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010114
Benjamin Peterson29060642009-01-31 22:14:21 +000010115 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010116 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010117 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010118 return NULL;
10119}
10120
Victor Stinnerd3f08822012-05-29 12:57:52 +020010121void
10122_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10123 Py_UCS4 fill_char)
10124{
10125 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010126 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010127 assert(PyUnicode_IS_READY(unicode));
10128 assert(unicode_modifiable(unicode));
10129 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10130 assert(start >= 0);
10131 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010132 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010133}
10134
Victor Stinner3fe55312012-01-04 00:33:50 +010010135Py_ssize_t
10136PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10137 Py_UCS4 fill_char)
10138{
10139 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010140
10141 if (!PyUnicode_Check(unicode)) {
10142 PyErr_BadInternalCall();
10143 return -1;
10144 }
10145 if (PyUnicode_READY(unicode) == -1)
10146 return -1;
10147 if (unicode_check_modifiable(unicode))
10148 return -1;
10149
Victor Stinnerd3f08822012-05-29 12:57:52 +020010150 if (start < 0) {
10151 PyErr_SetString(PyExc_IndexError, "string index out of range");
10152 return -1;
10153 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010154 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10155 PyErr_SetString(PyExc_ValueError,
10156 "fill character is bigger than "
10157 "the string maximum character");
10158 return -1;
10159 }
10160
10161 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10162 length = Py_MIN(maxlen, length);
10163 if (length <= 0)
10164 return 0;
10165
Victor Stinnerd3f08822012-05-29 12:57:52 +020010166 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010167 return length;
10168}
10169
Victor Stinner9310abb2011-10-05 00:59:23 +020010170static PyObject *
10171pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010172 Py_ssize_t left,
10173 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010174 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010175{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010176 PyObject *u;
10177 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010178 int kind;
10179 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
10181 if (left < 0)
10182 left = 0;
10183 if (right < 0)
10184 right = 0;
10185
Victor Stinnerc4b49542011-12-11 22:44:26 +010010186 if (left == 0 && right == 0)
10187 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010188
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010189 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10190 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010191 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10192 return NULL;
10193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010194 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010195 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010196 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010197 if (!u)
10198 return NULL;
10199
10200 kind = PyUnicode_KIND(u);
10201 data = PyUnicode_DATA(u);
10202 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010203 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010204 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010205 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010206 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010207 assert(_PyUnicode_CheckConsistency(u, 1));
10208 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209}
10210
Alexander Belopolsky40018472011-02-26 01:02:56 +000010211PyObject *
10212PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010213{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010214 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010215
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010216 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010217 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010218
Benjamin Petersonead6b532011-12-20 17:23:42 -060010219 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010221 if (PyUnicode_IS_ASCII(string))
10222 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010223 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010224 PyUnicode_GET_LENGTH(string), keepends);
10225 else
10226 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010227 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010228 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010229 break;
10230 case PyUnicode_2BYTE_KIND:
10231 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010232 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010233 PyUnicode_GET_LENGTH(string), keepends);
10234 break;
10235 case PyUnicode_4BYTE_KIND:
10236 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010237 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010238 PyUnicode_GET_LENGTH(string), keepends);
10239 break;
10240 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010241 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010243 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244}
10245
Alexander Belopolsky40018472011-02-26 01:02:56 +000010246static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010247split(PyObject *self,
10248 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010249 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010251 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 void *buf1, *buf2;
10253 Py_ssize_t len1, len2;
10254 PyObject* out;
10255
Guido van Rossumd57fd912000-03-10 22:53:23 +000010256 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010257 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010258
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010259 if (PyUnicode_READY(self) == -1)
10260 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010261
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010262 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010263 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010264 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010265 if (PyUnicode_IS_ASCII(self))
10266 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 else
10271 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 case PyUnicode_2BYTE_KIND:
10276 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010277 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010278 PyUnicode_GET_LENGTH(self), maxcount
10279 );
10280 case PyUnicode_4BYTE_KIND:
10281 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010282 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 PyUnicode_GET_LENGTH(self), maxcount
10284 );
10285 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010286 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010287 }
10288
10289 if (PyUnicode_READY(substring) == -1)
10290 return NULL;
10291
10292 kind1 = PyUnicode_KIND(self);
10293 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010294 len1 = PyUnicode_GET_LENGTH(self);
10295 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010296 if (kind1 < kind2 || len1 < len2) {
10297 out = PyList_New(1);
10298 if (out == NULL)
10299 return NULL;
10300 Py_INCREF(self);
10301 PyList_SET_ITEM(out, 0, self);
10302 return out;
10303 }
10304 buf1 = PyUnicode_DATA(self);
10305 buf2 = PyUnicode_DATA(substring);
10306 if (kind2 != kind1) {
10307 buf2 = _PyUnicode_AsKind(substring, kind1);
10308 if (!buf2)
10309 return NULL;
10310 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010311
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010312 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010313 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010314 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10315 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010316 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010317 else
10318 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 break;
10321 case PyUnicode_2BYTE_KIND:
10322 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010323 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010324 break;
10325 case PyUnicode_4BYTE_KIND:
10326 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010327 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010328 break;
10329 default:
10330 out = NULL;
10331 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010332 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 PyMem_Free(buf2);
10334 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010335}
10336
Alexander Belopolsky40018472011-02-26 01:02:56 +000010337static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010338rsplit(PyObject *self,
10339 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010340 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010341{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010342 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 void *buf1, *buf2;
10344 Py_ssize_t len1, len2;
10345 PyObject* out;
10346
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010347 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010348 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010350 if (PyUnicode_READY(self) == -1)
10351 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010352
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010353 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010354 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010355 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 if (PyUnicode_IS_ASCII(self))
10357 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 else
10362 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010364 PyUnicode_GET_LENGTH(self), maxcount
10365 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 case PyUnicode_2BYTE_KIND:
10367 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010368 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010369 PyUnicode_GET_LENGTH(self), maxcount
10370 );
10371 case PyUnicode_4BYTE_KIND:
10372 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010373 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010374 PyUnicode_GET_LENGTH(self), maxcount
10375 );
10376 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010377 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010378 }
10379
10380 if (PyUnicode_READY(substring) == -1)
10381 return NULL;
10382
10383 kind1 = PyUnicode_KIND(self);
10384 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010385 len1 = PyUnicode_GET_LENGTH(self);
10386 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010387 if (kind1 < kind2 || len1 < len2) {
10388 out = PyList_New(1);
10389 if (out == NULL)
10390 return NULL;
10391 Py_INCREF(self);
10392 PyList_SET_ITEM(out, 0, self);
10393 return out;
10394 }
10395 buf1 = PyUnicode_DATA(self);
10396 buf2 = PyUnicode_DATA(substring);
10397 if (kind2 != kind1) {
10398 buf2 = _PyUnicode_AsKind(substring, kind1);
10399 if (!buf2)
10400 return NULL;
10401 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010402
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010403 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010404 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010405 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10406 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010407 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010408 else
10409 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010410 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 break;
10412 case PyUnicode_2BYTE_KIND:
10413 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010414 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010415 break;
10416 case PyUnicode_4BYTE_KIND:
10417 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010418 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010419 break;
10420 default:
10421 out = NULL;
10422 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010423 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010424 PyMem_Free(buf2);
10425 return out;
10426}
10427
10428static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010429anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10430 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010431{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010432 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010434 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10435 return asciilib_find(buf1, len1, buf2, len2, offset);
10436 else
10437 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438 case PyUnicode_2BYTE_KIND:
10439 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10440 case PyUnicode_4BYTE_KIND:
10441 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10442 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010443 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010444}
10445
10446static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010447anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10448 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010449{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010450 switch (kind) {
10451 case PyUnicode_1BYTE_KIND:
10452 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10453 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10454 else
10455 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10456 case PyUnicode_2BYTE_KIND:
10457 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10458 case PyUnicode_4BYTE_KIND:
10459 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10460 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010461 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010462}
10463
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010464static void
10465replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10466 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10467{
10468 int kind = PyUnicode_KIND(u);
10469 void *data = PyUnicode_DATA(u);
10470 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10471 if (kind == PyUnicode_1BYTE_KIND) {
10472 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10473 (Py_UCS1 *)data + len,
10474 u1, u2, maxcount);
10475 }
10476 else if (kind == PyUnicode_2BYTE_KIND) {
10477 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10478 (Py_UCS2 *)data + len,
10479 u1, u2, maxcount);
10480 }
10481 else {
10482 assert(kind == PyUnicode_4BYTE_KIND);
10483 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10484 (Py_UCS4 *)data + len,
10485 u1, u2, maxcount);
10486 }
10487}
10488
Alexander Belopolsky40018472011-02-26 01:02:56 +000010489static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010490replace(PyObject *self, PyObject *str1,
10491 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010492{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010493 PyObject *u;
10494 char *sbuf = PyUnicode_DATA(self);
10495 char *buf1 = PyUnicode_DATA(str1);
10496 char *buf2 = PyUnicode_DATA(str2);
10497 int srelease = 0, release1 = 0, release2 = 0;
10498 int skind = PyUnicode_KIND(self);
10499 int kind1 = PyUnicode_KIND(str1);
10500 int kind2 = PyUnicode_KIND(str2);
10501 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10502 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10503 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010504 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010505 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010506
10507 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010508 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010509 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010510 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010511
Victor Stinner59de0ee2011-10-07 10:01:28 +020010512 if (str1 == str2)
10513 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010514
Victor Stinner49a0a212011-10-12 23:46:10 +020010515 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010516 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10517 if (maxchar < maxchar_str1)
10518 /* substring too wide to be present */
10519 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010520 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10521 /* Replacing str1 with str2 may cause a maxchar reduction in the
10522 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010523 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010524 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010525
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010526 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010527 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010529 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010531 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010532 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010533 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010534
Victor Stinner69ed0f42013-04-09 21:48:24 +020010535 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010536 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010537 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010538 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010539 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010541 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010543
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010544 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10545 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010546 }
10547 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010548 int rkind = skind;
10549 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010550 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010552 if (kind1 < rkind) {
10553 /* widen substring */
10554 buf1 = _PyUnicode_AsKind(str1, rkind);
10555 if (!buf1) goto error;
10556 release1 = 1;
10557 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010558 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010559 if (i < 0)
10560 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010561 if (rkind > kind2) {
10562 /* widen replacement */
10563 buf2 = _PyUnicode_AsKind(str2, rkind);
10564 if (!buf2) goto error;
10565 release2 = 1;
10566 }
10567 else if (rkind < kind2) {
10568 /* widen self and buf1 */
10569 rkind = kind2;
10570 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010571 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010572 sbuf = _PyUnicode_AsKind(self, rkind);
10573 if (!sbuf) goto error;
10574 srelease = 1;
10575 buf1 = _PyUnicode_AsKind(str1, rkind);
10576 if (!buf1) goto error;
10577 release1 = 1;
10578 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010579 u = PyUnicode_New(slen, maxchar);
10580 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010581 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010582 assert(PyUnicode_KIND(u) == rkind);
10583 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010584
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010585 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010586 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010587 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010588 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010589 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010590 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010591
10592 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010593 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010595 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010596 if (i == -1)
10597 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010598 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010600 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010602 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010603 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010604 }
10605 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010606 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010607 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010608 int rkind = skind;
10609 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010610
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010611 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010612 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010613 buf1 = _PyUnicode_AsKind(str1, rkind);
10614 if (!buf1) goto error;
10615 release1 = 1;
10616 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010617 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010618 if (n == 0)
10619 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 buf2 = _PyUnicode_AsKind(str2, rkind);
10623 if (!buf2) goto error;
10624 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010625 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010627 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 rkind = kind2;
10629 sbuf = _PyUnicode_AsKind(self, rkind);
10630 if (!sbuf) goto error;
10631 srelease = 1;
10632 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010633 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010634 buf1 = _PyUnicode_AsKind(str1, rkind);
10635 if (!buf1) goto error;
10636 release1 = 1;
10637 }
10638 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10639 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010640 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010641 PyErr_SetString(PyExc_OverflowError,
10642 "replace string is too long");
10643 goto error;
10644 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010645 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010646 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010647 _Py_INCREF_UNICODE_EMPTY();
10648 if (!unicode_empty)
10649 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010650 u = unicode_empty;
10651 goto done;
10652 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010653 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 PyErr_SetString(PyExc_OverflowError,
10655 "replace string is too long");
10656 goto error;
10657 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 u = PyUnicode_New(new_size, maxchar);
10659 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010661 assert(PyUnicode_KIND(u) == rkind);
10662 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010663 ires = i = 0;
10664 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 while (n-- > 0) {
10666 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010667 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010669 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010670 if (j == -1)
10671 break;
10672 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010673 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010674 memcpy(res + rkind * ires,
10675 sbuf + rkind * i,
10676 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010677 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010678 }
10679 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010681 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010683 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010684 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010686 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010690 memcpy(res + rkind * ires,
10691 sbuf + rkind * i,
10692 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010693 }
10694 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 /* interleave */
10696 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010697 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010699 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010701 if (--n <= 0)
10702 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010703 memcpy(res + rkind * ires,
10704 sbuf + rkind * i,
10705 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010706 ires++;
10707 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010708 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010709 memcpy(res + rkind * ires,
10710 sbuf + rkind * i,
10711 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010712 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010713 }
10714
10715 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010716 unicode_adjust_maxchar(&u);
10717 if (u == NULL)
10718 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010719 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010720
10721 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010722 if (srelease)
10723 PyMem_FREE(sbuf);
10724 if (release1)
10725 PyMem_FREE(buf1);
10726 if (release2)
10727 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010728 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010730
Benjamin Peterson29060642009-01-31 22:14:21 +000010731 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010732 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 if (srelease)
10734 PyMem_FREE(sbuf);
10735 if (release1)
10736 PyMem_FREE(buf1);
10737 if (release2)
10738 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010739 return unicode_result_unchanged(self);
10740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 error:
10742 if (srelease && sbuf)
10743 PyMem_FREE(sbuf);
10744 if (release1 && buf1)
10745 PyMem_FREE(buf1);
10746 if (release2 && buf2)
10747 PyMem_FREE(buf2);
10748 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749}
10750
10751/* --- Unicode Object Methods --------------------------------------------- */
10752
INADA Naoki3ae20562017-01-16 20:41:20 +090010753/*[clinic input]
10754str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755
INADA Naoki3ae20562017-01-16 20:41:20 +090010756Return a version of the string where each word is titlecased.
10757
10758More specifically, words start with uppercased characters and all remaining
10759cased characters have lower case.
10760[clinic start generated code]*/
10761
10762static PyObject *
10763unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010764/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010766 if (PyUnicode_READY(self) == -1)
10767 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010768 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010769}
10770
INADA Naoki3ae20562017-01-16 20:41:20 +090010771/*[clinic input]
10772str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773
INADA Naoki3ae20562017-01-16 20:41:20 +090010774Return a capitalized version of the string.
10775
10776More specifically, make the first character have upper case and the rest lower
10777case.
10778[clinic start generated code]*/
10779
10780static PyObject *
10781unicode_capitalize_impl(PyObject *self)
10782/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010784 if (PyUnicode_READY(self) == -1)
10785 return NULL;
10786 if (PyUnicode_GET_LENGTH(self) == 0)
10787 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010788 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010789}
10790
INADA Naoki3ae20562017-01-16 20:41:20 +090010791/*[clinic input]
10792str.casefold as unicode_casefold
10793
10794Return a version of the string suitable for caseless comparisons.
10795[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010796
10797static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010798unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010799/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010800{
10801 if (PyUnicode_READY(self) == -1)
10802 return NULL;
10803 if (PyUnicode_IS_ASCII(self))
10804 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010805 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010806}
10807
10808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010809/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010810
10811static int
10812convert_uc(PyObject *obj, void *addr)
10813{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010814 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010815
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010816 if (!PyUnicode_Check(obj)) {
10817 PyErr_Format(PyExc_TypeError,
10818 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010819 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010820 return 0;
10821 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010822 if (PyUnicode_READY(obj) < 0)
10823 return 0;
10824 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010825 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010826 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010827 return 0;
10828 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010829 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010830 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010831}
10832
INADA Naoki3ae20562017-01-16 20:41:20 +090010833/*[clinic input]
10834str.center as unicode_center
10835
10836 width: Py_ssize_t
10837 fillchar: Py_UCS4 = ' '
10838 /
10839
10840Return a centered string of length width.
10841
10842Padding is done using the specified fill character (default is a space).
10843[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844
10845static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010846unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10847/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010849 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850
Benjamin Petersonbac79492012-01-14 13:34:47 -050010851 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 return NULL;
10853
Victor Stinnerc4b49542011-12-11 22:44:26 +010010854 if (PyUnicode_GET_LENGTH(self) >= width)
10855 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010856
Victor Stinnerc4b49542011-12-11 22:44:26 +010010857 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010858 left = marg / 2 + (marg & width & 1);
10859
Victor Stinner9310abb2011-10-05 00:59:23 +020010860 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010861}
10862
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010863/* This function assumes that str1 and str2 are readied by the caller. */
10864
Marc-André Lemburge5034372000-08-08 08:04:29 +000010865static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010866unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010867{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010868#define COMPARE(TYPE1, TYPE2) \
10869 do { \
10870 TYPE1* p1 = (TYPE1 *)data1; \
10871 TYPE2* p2 = (TYPE2 *)data2; \
10872 TYPE1* end = p1 + len; \
10873 Py_UCS4 c1, c2; \
10874 for (; p1 != end; p1++, p2++) { \
10875 c1 = *p1; \
10876 c2 = *p2; \
10877 if (c1 != c2) \
10878 return (c1 < c2) ? -1 : 1; \
10879 } \
10880 } \
10881 while (0)
10882
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010883 int kind1, kind2;
10884 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010886
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010887 kind1 = PyUnicode_KIND(str1);
10888 kind2 = PyUnicode_KIND(str2);
10889 data1 = PyUnicode_DATA(str1);
10890 data2 = PyUnicode_DATA(str2);
10891 len1 = PyUnicode_GET_LENGTH(str1);
10892 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010893 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010894
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010895 switch(kind1) {
10896 case PyUnicode_1BYTE_KIND:
10897 {
10898 switch(kind2) {
10899 case PyUnicode_1BYTE_KIND:
10900 {
10901 int cmp = memcmp(data1, data2, len);
10902 /* normalize result of memcmp() into the range [-1; 1] */
10903 if (cmp < 0)
10904 return -1;
10905 if (cmp > 0)
10906 return 1;
10907 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010908 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010909 case PyUnicode_2BYTE_KIND:
10910 COMPARE(Py_UCS1, Py_UCS2);
10911 break;
10912 case PyUnicode_4BYTE_KIND:
10913 COMPARE(Py_UCS1, Py_UCS4);
10914 break;
10915 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010916 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010917 }
10918 break;
10919 }
10920 case PyUnicode_2BYTE_KIND:
10921 {
10922 switch(kind2) {
10923 case PyUnicode_1BYTE_KIND:
10924 COMPARE(Py_UCS2, Py_UCS1);
10925 break;
10926 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010927 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010928 COMPARE(Py_UCS2, Py_UCS2);
10929 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010930 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010931 case PyUnicode_4BYTE_KIND:
10932 COMPARE(Py_UCS2, Py_UCS4);
10933 break;
10934 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010935 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010936 }
10937 break;
10938 }
10939 case PyUnicode_4BYTE_KIND:
10940 {
10941 switch(kind2) {
10942 case PyUnicode_1BYTE_KIND:
10943 COMPARE(Py_UCS4, Py_UCS1);
10944 break;
10945 case PyUnicode_2BYTE_KIND:
10946 COMPARE(Py_UCS4, Py_UCS2);
10947 break;
10948 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010949 {
10950#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10951 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10952 /* normalize result of wmemcmp() into the range [-1; 1] */
10953 if (cmp < 0)
10954 return -1;
10955 if (cmp > 0)
10956 return 1;
10957#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010958 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010959#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010960 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010961 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010962 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010963 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010964 }
10965 break;
10966 }
10967 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010968 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010969 }
10970
Victor Stinner770e19e2012-10-04 22:59:45 +020010971 if (len1 == len2)
10972 return 0;
10973 if (len1 < len2)
10974 return -1;
10975 else
10976 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010977
10978#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010979}
10980
Benjamin Peterson621b4302016-09-09 13:54:34 -070010981static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010982unicode_compare_eq(PyObject *str1, PyObject *str2)
10983{
10984 int kind;
10985 void *data1, *data2;
10986 Py_ssize_t len;
10987 int cmp;
10988
Victor Stinnere5567ad2012-10-23 02:48:49 +020010989 len = PyUnicode_GET_LENGTH(str1);
10990 if (PyUnicode_GET_LENGTH(str2) != len)
10991 return 0;
10992 kind = PyUnicode_KIND(str1);
10993 if (PyUnicode_KIND(str2) != kind)
10994 return 0;
10995 data1 = PyUnicode_DATA(str1);
10996 data2 = PyUnicode_DATA(str2);
10997
10998 cmp = memcmp(data1, data2, len * kind);
10999 return (cmp == 0);
11000}
11001
11002
Alexander Belopolsky40018472011-02-26 01:02:56 +000011003int
11004PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11007 if (PyUnicode_READY(left) == -1 ||
11008 PyUnicode_READY(right) == -1)
11009 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011010
11011 /* a string is equal to itself */
11012 if (left == right)
11013 return 0;
11014
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011015 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011016 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011017 PyErr_Format(PyExc_TypeError,
11018 "Can't compare %.100s and %.100s",
11019 left->ob_type->tp_name,
11020 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011021 return -1;
11022}
11023
Martin v. Löwis5b222132007-06-10 09:51:05 +000011024int
11025PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011027 Py_ssize_t i;
11028 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011029 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011030 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011031
Victor Stinner910337b2011-10-03 03:20:16 +020011032 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011033 if (!PyUnicode_IS_READY(uni)) {
11034 const wchar_t *ws = _PyUnicode_WSTR(uni);
11035 /* Compare Unicode string and source character set string */
11036 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11037 if (chr != ustr[i])
11038 return (chr < ustr[i]) ? -1 : 1;
11039 }
11040 /* This check keeps Python strings that end in '\0' from comparing equal
11041 to C strings identical up to that point. */
11042 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11043 return 1; /* uni is longer */
11044 if (ustr[i])
11045 return -1; /* str is longer */
11046 return 0;
11047 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011048 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011049 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011050 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011051 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011052 size_t len, len2 = strlen(str);
11053 int cmp;
11054
11055 len = Py_MIN(len1, len2);
11056 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011057 if (cmp != 0) {
11058 if (cmp < 0)
11059 return -1;
11060 else
11061 return 1;
11062 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011063 if (len1 > len2)
11064 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011065 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011066 return -1; /* str is longer */
11067 return 0;
11068 }
11069 else {
11070 void *data = PyUnicode_DATA(uni);
11071 /* Compare Unicode string and source character set string */
11072 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011073 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011074 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11075 /* This check keeps Python strings that end in '\0' from comparing equal
11076 to C strings identical up to that point. */
11077 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11078 return 1; /* uni is longer */
11079 if (str[i])
11080 return -1; /* str is longer */
11081 return 0;
11082 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011083}
11084
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011085static int
11086non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11087{
11088 size_t i, len;
11089 const wchar_t *p;
11090 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11091 if (strlen(str) != len)
11092 return 0;
11093 p = _PyUnicode_WSTR(unicode);
11094 assert(p);
11095 for (i = 0; i < len; i++) {
11096 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011097 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011098 return 0;
11099 }
11100 return 1;
11101}
11102
11103int
11104_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11105{
11106 size_t len;
11107 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011108 assert(str);
11109#ifndef NDEBUG
11110 for (const char *p = str; *p; p++) {
11111 assert((unsigned char)*p < 128);
11112 }
11113#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011114 if (PyUnicode_READY(unicode) == -1) {
11115 /* Memory error or bad data */
11116 PyErr_Clear();
11117 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11118 }
11119 if (!PyUnicode_IS_ASCII(unicode))
11120 return 0;
11121 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11122 return strlen(str) == len &&
11123 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11124}
11125
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011126int
11127_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11128{
11129 PyObject *right_uni;
11130 Py_hash_t hash;
11131
11132 assert(_PyUnicode_CHECK(left));
11133 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011134#ifndef NDEBUG
11135 for (const char *p = right->string; *p; p++) {
11136 assert((unsigned char)*p < 128);
11137 }
11138#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011139
11140 if (PyUnicode_READY(left) == -1) {
11141 /* memory error or bad data */
11142 PyErr_Clear();
11143 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11144 }
11145
11146 if (!PyUnicode_IS_ASCII(left))
11147 return 0;
11148
11149 right_uni = _PyUnicode_FromId(right); /* borrowed */
11150 if (right_uni == NULL) {
11151 /* memory error or bad data */
11152 PyErr_Clear();
11153 return _PyUnicode_EqualToASCIIString(left, right->string);
11154 }
11155
11156 if (left == right_uni)
11157 return 1;
11158
11159 if (PyUnicode_CHECK_INTERNED(left))
11160 return 0;
11161
INADA Naoki7cc95f52018-01-28 02:07:09 +090011162 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011163 hash = _PyUnicode_HASH(left);
11164 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11165 return 0;
11166
11167 return unicode_compare_eq(left, right_uni);
11168}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011169
Alexander Belopolsky40018472011-02-26 01:02:56 +000011170PyObject *
11171PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011172{
11173 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011174
Victor Stinnere5567ad2012-10-23 02:48:49 +020011175 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11176 Py_RETURN_NOTIMPLEMENTED;
11177
11178 if (PyUnicode_READY(left) == -1 ||
11179 PyUnicode_READY(right) == -1)
11180 return NULL;
11181
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011182 if (left == right) {
11183 switch (op) {
11184 case Py_EQ:
11185 case Py_LE:
11186 case Py_GE:
11187 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011188 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011189 case Py_NE:
11190 case Py_LT:
11191 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011192 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011193 default:
11194 PyErr_BadArgument();
11195 return NULL;
11196 }
11197 }
11198 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011199 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011200 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011201 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011202 }
11203 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011204 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011205 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011206 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011207}
11208
Alexander Belopolsky40018472011-02-26 01:02:56 +000011209int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011210_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11211{
11212 return unicode_eq(aa, bb);
11213}
11214
11215int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011216PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011217{
Victor Stinner77282cb2013-04-14 19:22:47 +020011218 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011219 void *buf1, *buf2;
11220 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011221 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011222
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011224 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011225 "'in <string>' requires string as left operand, not %.100s",
11226 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011228 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011230 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 if (ensure_unicode(str) < 0)
11232 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011233
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011234 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 kind2 = PyUnicode_KIND(substr);
11236 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011237 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011238 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011239 len2 = PyUnicode_GET_LENGTH(substr);
11240 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011242 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011243 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 if (len2 == 1) {
11245 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11246 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011247 return result;
11248 }
11249 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011250 buf2 = _PyUnicode_AsKind(substr, kind1);
11251 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011252 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011253 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011254
Victor Stinner77282cb2013-04-14 19:22:47 +020011255 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011256 case PyUnicode_1BYTE_KIND:
11257 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 case PyUnicode_2BYTE_KIND:
11260 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11261 break;
11262 case PyUnicode_4BYTE_KIND:
11263 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11264 break;
11265 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011266 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011267 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011268
Victor Stinner77282cb2013-04-14 19:22:47 +020011269 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011270 PyMem_Free(buf2);
11271
Guido van Rossum403d68b2000-03-13 15:55:09 +000011272 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011273}
11274
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275/* Concat to string or Unicode object giving a new Unicode object. */
11276
Alexander Belopolsky40018472011-02-26 01:02:56 +000011277PyObject *
11278PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011279{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011280 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011281 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011282 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011283
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011284 if (ensure_unicode(left) < 0)
11285 return NULL;
11286
11287 if (!PyUnicode_Check(right)) {
11288 PyErr_Format(PyExc_TypeError,
11289 "can only concatenate str (not \"%.200s\") to str",
11290 right->ob_type->tp_name);
11291 return NULL;
11292 }
11293 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
11296 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 if (left == unicode_empty)
11298 return PyUnicode_FromObject(right);
11299 if (right == unicode_empty)
11300 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011301
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011302 left_len = PyUnicode_GET_LENGTH(left);
11303 right_len = PyUnicode_GET_LENGTH(right);
11304 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011305 PyErr_SetString(PyExc_OverflowError,
11306 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011307 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011308 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011309 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011310
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011311 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11312 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011313 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011314
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011316 result = PyUnicode_New(new_len, maxchar);
11317 if (result == NULL)
11318 return NULL;
11319 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11320 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11321 assert(_PyUnicode_CheckConsistency(result, 1));
11322 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011323}
11324
Walter Dörwald1ab83302007-05-18 17:15:44 +000011325void
Victor Stinner23e56682011-10-03 03:54:37 +020011326PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011327{
Victor Stinner23e56682011-10-03 03:54:37 +020011328 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011329 Py_UCS4 maxchar, maxchar2;
11330 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011331
11332 if (p_left == NULL) {
11333 if (!PyErr_Occurred())
11334 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011335 return;
11336 }
Victor Stinner23e56682011-10-03 03:54:37 +020011337 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011338 if (right == NULL || left == NULL
11339 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011340 if (!PyErr_Occurred())
11341 PyErr_BadInternalCall();
11342 goto error;
11343 }
11344
Benjamin Petersonbac79492012-01-14 13:34:47 -050011345 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011346 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011347 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011348 goto error;
11349
Victor Stinner488fa492011-12-12 00:01:39 +010011350 /* Shortcuts */
11351 if (left == unicode_empty) {
11352 Py_DECREF(left);
11353 Py_INCREF(right);
11354 *p_left = right;
11355 return;
11356 }
11357 if (right == unicode_empty)
11358 return;
11359
11360 left_len = PyUnicode_GET_LENGTH(left);
11361 right_len = PyUnicode_GET_LENGTH(right);
11362 if (left_len > PY_SSIZE_T_MAX - right_len) {
11363 PyErr_SetString(PyExc_OverflowError,
11364 "strings are too large to concat");
11365 goto error;
11366 }
11367 new_len = left_len + right_len;
11368
11369 if (unicode_modifiable(left)
11370 && PyUnicode_CheckExact(right)
11371 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011372 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11373 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011374 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011375 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011376 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11377 {
11378 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011379 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011380 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011381
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011382 /* copy 'right' into the newly allocated area of 'left' */
11383 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011384 }
Victor Stinner488fa492011-12-12 00:01:39 +010011385 else {
11386 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11387 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011388 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011389
Victor Stinner488fa492011-12-12 00:01:39 +010011390 /* Concat the two Unicode strings */
11391 res = PyUnicode_New(new_len, maxchar);
11392 if (res == NULL)
11393 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011394 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11395 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011396 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011397 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011398 }
11399 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011400 return;
11401
11402error:
Victor Stinner488fa492011-12-12 00:01:39 +010011403 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011404}
11405
11406void
11407PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11408{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011409 PyUnicode_Append(pleft, right);
11410 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011411}
11412
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011413/*
11414Wraps stringlib_parse_args_finds() and additionally ensures that the
11415first argument is a unicode object.
11416*/
11417
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011418static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011419parse_args_finds_unicode(const char * function_name, PyObject *args,
11420 PyObject **substring,
11421 Py_ssize_t *start, Py_ssize_t *end)
11422{
11423 if(stringlib_parse_args_finds(function_name, args, substring,
11424 start, end)) {
11425 if (ensure_unicode(*substring) < 0)
11426 return 0;
11427 return 1;
11428 }
11429 return 0;
11430}
11431
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011432PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011433 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011434\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011435Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011436string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011437interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011438
11439static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011440unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011442 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011443 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011444 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011445 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011446 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 void *buf1, *buf2;
11448 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011449
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011450 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011451 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011452
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011453 kind1 = PyUnicode_KIND(self);
11454 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011456 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011457
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011458 len1 = PyUnicode_GET_LENGTH(self);
11459 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011460 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011461 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011463
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011464 buf1 = PyUnicode_DATA(self);
11465 buf2 = PyUnicode_DATA(substring);
11466 if (kind2 != kind1) {
11467 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011468 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011469 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011470 }
11471 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011472 case PyUnicode_1BYTE_KIND:
11473 iresult = ucs1lib_count(
11474 ((Py_UCS1*)buf1) + start, end - start,
11475 buf2, len2, PY_SSIZE_T_MAX
11476 );
11477 break;
11478 case PyUnicode_2BYTE_KIND:
11479 iresult = ucs2lib_count(
11480 ((Py_UCS2*)buf1) + start, end - start,
11481 buf2, len2, PY_SSIZE_T_MAX
11482 );
11483 break;
11484 case PyUnicode_4BYTE_KIND:
11485 iresult = ucs4lib_count(
11486 ((Py_UCS4*)buf1) + start, end - start,
11487 buf2, len2, PY_SSIZE_T_MAX
11488 );
11489 break;
11490 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011491 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011492 }
11493
11494 result = PyLong_FromSsize_t(iresult);
11495
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011496 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011497 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011498
Guido van Rossumd57fd912000-03-10 22:53:23 +000011499 return result;
11500}
11501
INADA Naoki3ae20562017-01-16 20:41:20 +090011502/*[clinic input]
11503str.encode as unicode_encode
11504
11505 encoding: str(c_default="NULL") = 'utf-8'
11506 The encoding in which to encode the string.
11507 errors: str(c_default="NULL") = 'strict'
11508 The error handling scheme to use for encoding errors.
11509 The default is 'strict' meaning that encoding errors raise a
11510 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11511 'xmlcharrefreplace' as well as any other name registered with
11512 codecs.register_error that can handle UnicodeEncodeErrors.
11513
11514Encode the string using the codec registered for encoding.
11515[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
11517static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011518unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011519/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011521 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011522}
11523
INADA Naoki3ae20562017-01-16 20:41:20 +090011524/*[clinic input]
11525str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011526
INADA Naoki3ae20562017-01-16 20:41:20 +090011527 tabsize: int = 8
11528
11529Return a copy where all tab characters are expanded using spaces.
11530
11531If tabsize is not given, a tab size of 8 characters is assumed.
11532[clinic start generated code]*/
11533
11534static PyObject *
11535unicode_expandtabs_impl(PyObject *self, int tabsize)
11536/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011537{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011538 Py_ssize_t i, j, line_pos, src_len, incr;
11539 Py_UCS4 ch;
11540 PyObject *u;
11541 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011542 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011543 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011544
Antoine Pitrou22425222011-10-04 19:10:51 +020011545 if (PyUnicode_READY(self) == -1)
11546 return NULL;
11547
Thomas Wouters7e474022000-07-16 12:04:32 +000011548 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011549 src_len = PyUnicode_GET_LENGTH(self);
11550 i = j = line_pos = 0;
11551 kind = PyUnicode_KIND(self);
11552 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011553 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011554 for (; i < src_len; i++) {
11555 ch = PyUnicode_READ(kind, src_data, i);
11556 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011557 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011558 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011559 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011560 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 goto overflow;
11562 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011563 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011564 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011565 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011566 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011567 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 goto overflow;
11569 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 if (ch == '\n' || ch == '\r')
11572 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011574 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011575 if (!found)
11576 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011577
Guido van Rossumd57fd912000-03-10 22:53:23 +000011578 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011579 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011580 if (!u)
11581 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011583
Antoine Pitroue71d5742011-10-04 15:55:09 +020011584 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011585
Antoine Pitroue71d5742011-10-04 15:55:09 +020011586 for (; i < src_len; i++) {
11587 ch = PyUnicode_READ(kind, src_data, i);
11588 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011589 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011590 incr = tabsize - (line_pos % tabsize);
11591 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011592 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011593 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011594 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011595 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011596 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 line_pos++;
11598 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011599 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011600 if (ch == '\n' || ch == '\r')
11601 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011603 }
11604 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011605 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011606
Antoine Pitroue71d5742011-10-04 15:55:09 +020011607 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011608 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11609 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610}
11611
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011612PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011613 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614\n\
11615Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011616such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617arguments start and end are interpreted as in slice notation.\n\
11618\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011619Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011620
11621static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011622unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011624 /* initialize variables to prevent gcc warning */
11625 PyObject *substring = NULL;
11626 Py_ssize_t start = 0;
11627 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011628 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011629
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011630 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011631 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011633 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011634 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011635
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011636 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011638 if (result == -2)
11639 return NULL;
11640
Christian Heimes217cfd12007-12-02 14:31:20 +000011641 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011642}
11643
11644static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011645unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011646{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011647 void *data;
11648 enum PyUnicode_Kind kind;
11649 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011650
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011651 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011652 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011653 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011654 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011655 if (PyUnicode_READY(self) == -1) {
11656 return NULL;
11657 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011658 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11659 PyErr_SetString(PyExc_IndexError, "string index out of range");
11660 return NULL;
11661 }
11662 kind = PyUnicode_KIND(self);
11663 data = PyUnicode_DATA(self);
11664 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011665 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666}
11667
Guido van Rossumc2504932007-09-18 19:42:40 +000011668/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011669 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011670static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011671unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011672{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011673 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011674
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011675#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011676 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011677#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011678 if (_PyUnicode_HASH(self) != -1)
11679 return _PyUnicode_HASH(self);
11680 if (PyUnicode_READY(self) == -1)
11681 return -1;
animalizea1d14252019-01-02 20:16:06 +080011682
Christian Heimes985ecdc2013-11-20 11:46:18 +010011683 x = _Py_HashBytes(PyUnicode_DATA(self),
11684 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011685 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011686 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687}
11688
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011689PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011690 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691\n\
oldkaa0735f2018-02-02 16:52:55 +080011692Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011693such that sub is contained within S[start:end]. Optional\n\
11694arguments start and end are interpreted as in slice notation.\n\
11695\n\
11696Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011697
11698static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011699unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011701 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011702 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011703 PyObject *substring = NULL;
11704 Py_ssize_t start = 0;
11705 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011706
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011707 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011709
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011710 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011711 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011712
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011713 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011714
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011715 if (result == -2)
11716 return NULL;
11717
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718 if (result < 0) {
11719 PyErr_SetString(PyExc_ValueError, "substring not found");
11720 return NULL;
11721 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011722
Christian Heimes217cfd12007-12-02 14:31:20 +000011723 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011724}
11725
INADA Naoki3ae20562017-01-16 20:41:20 +090011726/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011727str.isascii as unicode_isascii
11728
11729Return True if all characters in the string are ASCII, False otherwise.
11730
11731ASCII characters have code points in the range U+0000-U+007F.
11732Empty string is ASCII too.
11733[clinic start generated code]*/
11734
11735static PyObject *
11736unicode_isascii_impl(PyObject *self)
11737/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11738{
11739 if (PyUnicode_READY(self) == -1) {
11740 return NULL;
11741 }
11742 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11743}
11744
11745/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011746str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
INADA Naoki3ae20562017-01-16 20:41:20 +090011748Return True if the string is a lowercase string, False otherwise.
11749
11750A string is lowercase if all cased characters in the string are lowercase and
11751there is at least one cased character in the string.
11752[clinic start generated code]*/
11753
11754static PyObject *
11755unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011756/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011757{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011758 Py_ssize_t i, length;
11759 int kind;
11760 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 int cased;
11762
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011763 if (PyUnicode_READY(self) == -1)
11764 return NULL;
11765 length = PyUnicode_GET_LENGTH(self);
11766 kind = PyUnicode_KIND(self);
11767 data = PyUnicode_DATA(self);
11768
Guido van Rossumd57fd912000-03-10 22:53:23 +000011769 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011770 if (length == 1)
11771 return PyBool_FromLong(
11772 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011773
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011774 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011775 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011776 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011777
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011779 for (i = 0; i < length; i++) {
11780 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011781
Benjamin Peterson29060642009-01-31 22:14:21 +000011782 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011783 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011784 else if (!cased && Py_UNICODE_ISLOWER(ch))
11785 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011787 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011788}
11789
INADA Naoki3ae20562017-01-16 20:41:20 +090011790/*[clinic input]
11791str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792
INADA Naoki3ae20562017-01-16 20:41:20 +090011793Return True if the string is an uppercase string, False otherwise.
11794
11795A string is uppercase if all cased characters in the string are uppercase and
11796there is at least one cased character in the string.
11797[clinic start generated code]*/
11798
11799static PyObject *
11800unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011801/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011802{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 Py_ssize_t i, length;
11804 int kind;
11805 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 int cased;
11807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011808 if (PyUnicode_READY(self) == -1)
11809 return NULL;
11810 length = PyUnicode_GET_LENGTH(self);
11811 kind = PyUnicode_KIND(self);
11812 data = PyUnicode_DATA(self);
11813
Guido van Rossumd57fd912000-03-10 22:53:23 +000011814 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011815 if (length == 1)
11816 return PyBool_FromLong(
11817 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011819 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011820 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011821 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011822
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011824 for (i = 0; i < length; i++) {
11825 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011826
Benjamin Peterson29060642009-01-31 22:14:21 +000011827 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011828 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011829 else if (!cased && Py_UNICODE_ISUPPER(ch))
11830 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011832 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011833}
11834
INADA Naoki3ae20562017-01-16 20:41:20 +090011835/*[clinic input]
11836str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837
INADA Naoki3ae20562017-01-16 20:41:20 +090011838Return True if the string is a title-cased string, False otherwise.
11839
11840In a title-cased string, upper- and title-case characters may only
11841follow uncased characters and lowercase characters only cased ones.
11842[clinic start generated code]*/
11843
11844static PyObject *
11845unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011846/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011847{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011848 Py_ssize_t i, length;
11849 int kind;
11850 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 int cased, previous_is_cased;
11852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011853 if (PyUnicode_READY(self) == -1)
11854 return NULL;
11855 length = PyUnicode_GET_LENGTH(self);
11856 kind = PyUnicode_KIND(self);
11857 data = PyUnicode_DATA(self);
11858
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011860 if (length == 1) {
11861 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11862 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11863 (Py_UNICODE_ISUPPER(ch) != 0));
11864 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011866 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011867 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011868 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011869
Guido van Rossumd57fd912000-03-10 22:53:23 +000011870 cased = 0;
11871 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011872 for (i = 0; i < length; i++) {
11873 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011874
Benjamin Peterson29060642009-01-31 22:14:21 +000011875 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11876 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011877 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 previous_is_cased = 1;
11879 cased = 1;
11880 }
11881 else if (Py_UNICODE_ISLOWER(ch)) {
11882 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011883 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011884 previous_is_cased = 1;
11885 cased = 1;
11886 }
11887 else
11888 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011890 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891}
11892
INADA Naoki3ae20562017-01-16 20:41:20 +090011893/*[clinic input]
11894str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895
INADA Naoki3ae20562017-01-16 20:41:20 +090011896Return True if the string is a whitespace string, False otherwise.
11897
11898A string is whitespace if all characters in the string are whitespace and there
11899is at least one character in the string.
11900[clinic start generated code]*/
11901
11902static PyObject *
11903unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011904/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011906 Py_ssize_t i, length;
11907 int kind;
11908 void *data;
11909
11910 if (PyUnicode_READY(self) == -1)
11911 return NULL;
11912 length = PyUnicode_GET_LENGTH(self);
11913 kind = PyUnicode_KIND(self);
11914 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011915
Guido van Rossumd57fd912000-03-10 22:53:23 +000011916 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 if (length == 1)
11918 return PyBool_FromLong(
11919 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011921 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011922 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011923 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011924
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011925 for (i = 0; i < length; i++) {
11926 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011927 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011928 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011929 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011930 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011931}
11932
INADA Naoki3ae20562017-01-16 20:41:20 +090011933/*[clinic input]
11934str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935
INADA Naoki3ae20562017-01-16 20:41:20 +090011936Return True if the string is an alphabetic string, False otherwise.
11937
11938A string is alphabetic if all characters in the string are alphabetic and there
11939is at least one character in the string.
11940[clinic start generated code]*/
11941
11942static PyObject *
11943unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011944/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011946 Py_ssize_t i, length;
11947 int kind;
11948 void *data;
11949
11950 if (PyUnicode_READY(self) == -1)
11951 return NULL;
11952 length = PyUnicode_GET_LENGTH(self);
11953 kind = PyUnicode_KIND(self);
11954 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011955
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011956 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 if (length == 1)
11958 return PyBool_FromLong(
11959 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960
11961 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011962 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011963 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011964
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011965 for (i = 0; i < length; i++) {
11966 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011967 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011968 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011969 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011970}
11971
INADA Naoki3ae20562017-01-16 20:41:20 +090011972/*[clinic input]
11973str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011974
INADA Naoki3ae20562017-01-16 20:41:20 +090011975Return True if the string is an alpha-numeric string, False otherwise.
11976
11977A string is alpha-numeric if all characters in the string are alpha-numeric and
11978there is at least one character in the string.
11979[clinic start generated code]*/
11980
11981static PyObject *
11982unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011983/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 int kind;
11986 void *data;
11987 Py_ssize_t len, i;
11988
11989 if (PyUnicode_READY(self) == -1)
11990 return NULL;
11991
11992 kind = PyUnicode_KIND(self);
11993 data = PyUnicode_DATA(self);
11994 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011995
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011996 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (len == 1) {
11998 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11999 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12000 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012001
12002 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012003 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012004 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012005
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012006 for (i = 0; i < len; i++) {
12007 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012008 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012009 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012010 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012011 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012012}
12013
INADA Naoki3ae20562017-01-16 20:41:20 +090012014/*[clinic input]
12015str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016
INADA Naoki3ae20562017-01-16 20:41:20 +090012017Return True if the string is a decimal string, False otherwise.
12018
12019A string is a decimal string if all characters in the string are decimal and
12020there is at least one character in the string.
12021[clinic start generated code]*/
12022
12023static PyObject *
12024unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012025/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012027 Py_ssize_t i, length;
12028 int kind;
12029 void *data;
12030
12031 if (PyUnicode_READY(self) == -1)
12032 return NULL;
12033 length = PyUnicode_GET_LENGTH(self);
12034 kind = PyUnicode_KIND(self);
12035 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012036
Guido van Rossumd57fd912000-03-10 22:53:23 +000012037 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 if (length == 1)
12039 return PyBool_FromLong(
12040 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012042 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012044 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012046 for (i = 0; i < length; i++) {
12047 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012048 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012050 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012051}
12052
INADA Naoki3ae20562017-01-16 20:41:20 +090012053/*[clinic input]
12054str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055
INADA Naoki3ae20562017-01-16 20:41:20 +090012056Return True if the string is a digit string, False otherwise.
12057
12058A string is a digit string if all characters in the string are digits and there
12059is at least one character in the string.
12060[clinic start generated code]*/
12061
12062static PyObject *
12063unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012064/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012066 Py_ssize_t i, length;
12067 int kind;
12068 void *data;
12069
12070 if (PyUnicode_READY(self) == -1)
12071 return NULL;
12072 length = PyUnicode_GET_LENGTH(self);
12073 kind = PyUnicode_KIND(self);
12074 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
Guido van Rossumd57fd912000-03-10 22:53:23 +000012076 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 if (length == 1) {
12078 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12079 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12080 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012082 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012084 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012085
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012086 for (i = 0; i < length; i++) {
12087 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012088 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012090 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012091}
12092
INADA Naoki3ae20562017-01-16 20:41:20 +090012093/*[clinic input]
12094str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095
INADA Naoki3ae20562017-01-16 20:41:20 +090012096Return True if the string is a numeric string, False otherwise.
12097
12098A string is numeric if all characters in the string are numeric and there is at
12099least one character in the string.
12100[clinic start generated code]*/
12101
12102static PyObject *
12103unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012104/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012106 Py_ssize_t i, length;
12107 int kind;
12108 void *data;
12109
12110 if (PyUnicode_READY(self) == -1)
12111 return NULL;
12112 length = PyUnicode_GET_LENGTH(self);
12113 kind = PyUnicode_KIND(self);
12114 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012115
Guido van Rossumd57fd912000-03-10 22:53:23 +000012116 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 if (length == 1)
12118 return PyBool_FromLong(
12119 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012121 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012122 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012123 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012124
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 for (i = 0; i < length; i++) {
12126 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012127 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012128 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012129 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012130}
12131
Martin v. Löwis47383402007-08-15 07:32:56 +000012132int
12133PyUnicode_IsIdentifier(PyObject *self)
12134{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 int kind;
12136 void *data;
12137 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012138 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012139
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012140 if (PyUnicode_READY(self) == -1) {
12141 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012142 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012143 }
12144
12145 /* Special case for empty strings */
12146 if (PyUnicode_GET_LENGTH(self) == 0)
12147 return 0;
12148 kind = PyUnicode_KIND(self);
12149 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012150
12151 /* PEP 3131 says that the first character must be in
12152 XID_Start and subsequent characters in XID_Continue,
12153 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012154 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012155 letters, digits, underscore). However, given the current
12156 definition of XID_Start and XID_Continue, it is sufficient
12157 to check just for these, except that _ must be allowed
12158 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012159 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012160 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012161 return 0;
12162
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012163 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012165 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012166 return 1;
12167}
12168
INADA Naoki3ae20562017-01-16 20:41:20 +090012169/*[clinic input]
12170str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012171
INADA Naoki3ae20562017-01-16 20:41:20 +090012172Return True if the string is a valid Python identifier, False otherwise.
12173
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012174Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012175such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012176[clinic start generated code]*/
12177
12178static PyObject *
12179unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012180/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012181{
12182 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12183}
12184
INADA Naoki3ae20562017-01-16 20:41:20 +090012185/*[clinic input]
12186str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012187
INADA Naoki3ae20562017-01-16 20:41:20 +090012188Return True if the string is printable, False otherwise.
12189
12190A string is printable if all of its characters are considered printable in
12191repr() or if it is empty.
12192[clinic start generated code]*/
12193
12194static PyObject *
12195unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012196/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012197{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012198 Py_ssize_t i, length;
12199 int kind;
12200 void *data;
12201
12202 if (PyUnicode_READY(self) == -1)
12203 return NULL;
12204 length = PyUnicode_GET_LENGTH(self);
12205 kind = PyUnicode_KIND(self);
12206 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012207
12208 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012209 if (length == 1)
12210 return PyBool_FromLong(
12211 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012212
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012213 for (i = 0; i < length; i++) {
12214 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012215 Py_RETURN_FALSE;
12216 }
12217 }
12218 Py_RETURN_TRUE;
12219}
12220
INADA Naoki3ae20562017-01-16 20:41:20 +090012221/*[clinic input]
12222str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223
INADA Naoki3ae20562017-01-16 20:41:20 +090012224 iterable: object
12225 /
12226
12227Concatenate any number of strings.
12228
Martin Panter91a88662017-01-24 00:30:06 +000012229The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012230The result is returned as a new string.
12231
12232Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12233[clinic start generated code]*/
12234
12235static PyObject *
12236unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012237/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238{
INADA Naoki3ae20562017-01-16 20:41:20 +090012239 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
Martin v. Löwis18e16552006-02-15 17:27:45 +000012242static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012243unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012244{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012245 if (PyUnicode_READY(self) == -1)
12246 return -1;
12247 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
INADA Naoki3ae20562017-01-16 20:41:20 +090012250/*[clinic input]
12251str.ljust as unicode_ljust
12252
12253 width: Py_ssize_t
12254 fillchar: Py_UCS4 = ' '
12255 /
12256
12257Return a left-justified string of length width.
12258
12259Padding is done using the specified fill character (default is a space).
12260[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012261
12262static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012263unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12264/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012266 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012267 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012268
Victor Stinnerc4b49542011-12-11 22:44:26 +010012269 if (PyUnicode_GET_LENGTH(self) >= width)
12270 return unicode_result_unchanged(self);
12271
12272 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012273}
12274
INADA Naoki3ae20562017-01-16 20:41:20 +090012275/*[clinic input]
12276str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012277
INADA Naoki3ae20562017-01-16 20:41:20 +090012278Return a copy of the string converted to lowercase.
12279[clinic start generated code]*/
12280
12281static PyObject *
12282unicode_lower_impl(PyObject *self)
12283/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012285 if (PyUnicode_READY(self) == -1)
12286 return NULL;
12287 if (PyUnicode_IS_ASCII(self))
12288 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012289 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012290}
12291
/* Strip modes.  The values double as indexes into stripfuncnames below,
   so the two must be kept in the same order. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the strings and
   the pointer table are const: nothing writes to this array. */
static const char *const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method for strip mode i (one of the modes above). */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012300
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012301/* externally visible for str.strip(unicode) */
12302PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012303_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 void *data;
12306 int kind;
12307 Py_ssize_t i, j, len;
12308 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012310
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012311 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12312 return NULL;
12313
12314 kind = PyUnicode_KIND(self);
12315 data = PyUnicode_DATA(self);
12316 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012317 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012318 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12319 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012320 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012321
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 i = 0;
12323 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012324 while (i < len) {
12325 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12326 if (!BLOOM(sepmask, ch))
12327 break;
12328 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12329 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012330 i++;
12331 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012332 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012333
Benjamin Peterson14339b62009-01-31 16:36:08 +000012334 j = len;
12335 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012336 j--;
12337 while (j >= i) {
12338 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12339 if (!BLOOM(sepmask, ch))
12340 break;
12341 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12342 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012343 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012344 }
12345
Benjamin Peterson29060642009-01-31 22:14:21 +000012346 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012347 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012348
Victor Stinner7931d9a2011-11-04 00:22:48 +010012349 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350}
12351
12352PyObject*
12353PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12354{
12355 unsigned char *data;
12356 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012357 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012358
Victor Stinnerde636f32011-10-01 03:55:54 +020012359 if (PyUnicode_READY(self) == -1)
12360 return NULL;
12361
Victor Stinner684d5fd2012-05-03 02:32:34 +020012362 length = PyUnicode_GET_LENGTH(self);
12363 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012364
Victor Stinner684d5fd2012-05-03 02:32:34 +020012365 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012366 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012367
Victor Stinnerde636f32011-10-01 03:55:54 +020012368 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012369 PyErr_SetString(PyExc_IndexError, "string index out of range");
12370 return NULL;
12371 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012372 if (start >= length || end < start)
12373 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012374
Victor Stinner684d5fd2012-05-03 02:32:34 +020012375 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012376 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012377 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012378 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012379 }
12380 else {
12381 kind = PyUnicode_KIND(self);
12382 data = PyUnicode_1BYTE_DATA(self);
12383 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012384 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012385 length);
12386 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012388
12389static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012390do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012391{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012392 Py_ssize_t len, i, j;
12393
12394 if (PyUnicode_READY(self) == -1)
12395 return NULL;
12396
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012397 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012398
Victor Stinnercc7af722013-04-09 22:39:24 +020012399 if (PyUnicode_IS_ASCII(self)) {
12400 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12401
12402 i = 0;
12403 if (striptype != RIGHTSTRIP) {
12404 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012405 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012406 if (!_Py_ascii_whitespace[ch])
12407 break;
12408 i++;
12409 }
12410 }
12411
12412 j = len;
12413 if (striptype != LEFTSTRIP) {
12414 j--;
12415 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012416 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012417 if (!_Py_ascii_whitespace[ch])
12418 break;
12419 j--;
12420 }
12421 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012422 }
12423 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012424 else {
12425 int kind = PyUnicode_KIND(self);
12426 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012427
Victor Stinnercc7af722013-04-09 22:39:24 +020012428 i = 0;
12429 if (striptype != RIGHTSTRIP) {
12430 while (i < len) {
12431 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12432 if (!Py_UNICODE_ISSPACE(ch))
12433 break;
12434 i++;
12435 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012436 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012437
12438 j = len;
12439 if (striptype != LEFTSTRIP) {
12440 j--;
12441 while (j >= i) {
12442 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12443 if (!Py_UNICODE_ISSPACE(ch))
12444 break;
12445 j--;
12446 }
12447 j++;
12448 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012449 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012450
Victor Stinner7931d9a2011-11-04 00:22:48 +010012451 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012452}
12453
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012454
12455static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012456do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012457{
Serhiy Storchakad322abb2019-09-14 13:31:50 +030012458 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 if (PyUnicode_Check(sep))
12460 return _PyUnicode_XStrip(self, striptype, sep);
12461 else {
12462 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012463 "%s arg must be None or str",
12464 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012465 return NULL;
12466 }
12467 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012468
Benjamin Peterson14339b62009-01-31 16:36:08 +000012469 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012470}
12471
12472
INADA Naoki3ae20562017-01-16 20:41:20 +090012473/*[clinic input]
12474str.strip as unicode_strip
12475
12476 chars: object = None
12477 /
12478
Miss Islington (bot)0baa6b32019-10-09 14:55:39 -070012479Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012480
12481If chars is given and not None, remove characters in chars instead.
12482[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483
12484static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012485unicode_strip_impl(PyObject *self, PyObject *chars)
Miss Islington (bot)0baa6b32019-10-09 14:55:39 -070012486/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012487{
INADA Naoki3ae20562017-01-16 20:41:20 +090012488 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012489}
12490
12491
INADA Naoki3ae20562017-01-16 20:41:20 +090012492/*[clinic input]
12493str.lstrip as unicode_lstrip
12494
Serhiy Storchakad322abb2019-09-14 13:31:50 +030012495 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012496 /
12497
12498Return a copy of the string with leading whitespace removed.
12499
12500If chars is given and not None, remove characters in chars instead.
12501[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502
12503static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012504unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchakad322abb2019-09-14 13:31:50 +030012505/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012506{
INADA Naoki3ae20562017-01-16 20:41:20 +090012507 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012508}
12509
12510
INADA Naoki3ae20562017-01-16 20:41:20 +090012511/*[clinic input]
12512str.rstrip as unicode_rstrip
12513
Serhiy Storchakad322abb2019-09-14 13:31:50 +030012514 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012515 /
12516
12517Return a copy of the string with trailing whitespace removed.
12518
12519If chars is given and not None, remove characters in chars instead.
12520[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012521
12522static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012523unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchakad322abb2019-09-14 13:31:50 +030012524/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012525{
INADA Naoki3ae20562017-01-16 20:41:20 +090012526 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012527}
12528
12529
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012531unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012533 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012535
Serhiy Storchaka05997252013-01-26 12:14:02 +020012536 if (len < 1)
12537 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012538
Victor Stinnerc4b49542011-12-11 22:44:26 +010012539 /* no repeat, return original string */
12540 if (len == 1)
12541 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012542
Benjamin Petersonbac79492012-01-14 13:34:47 -050012543 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012544 return NULL;
12545
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012546 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012547 PyErr_SetString(PyExc_OverflowError,
12548 "repeated string is too long");
12549 return NULL;
12550 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012551 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012552
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012553 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012554 if (!u)
12555 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012556 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012557
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012558 if (PyUnicode_GET_LENGTH(str) == 1) {
12559 const int kind = PyUnicode_KIND(str);
12560 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012561 if (kind == PyUnicode_1BYTE_KIND) {
12562 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012563 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012564 }
12565 else if (kind == PyUnicode_2BYTE_KIND) {
12566 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012567 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012568 ucs2[n] = fill_char;
12569 } else {
12570 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12571 assert(kind == PyUnicode_4BYTE_KIND);
12572 for (n = 0; n < len; ++n)
12573 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012574 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012575 }
12576 else {
12577 /* number of characters copied this far */
12578 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012579 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012580 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012581 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012582 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012583 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012584 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012585 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012586 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012587 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012588 }
12589
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012590 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012591 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012592}
12593
Alexander Belopolsky40018472011-02-26 01:02:56 +000012594PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012595PyUnicode_Replace(PyObject *str,
12596 PyObject *substr,
12597 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012598 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012599{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012600 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12601 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012602 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012603 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012604}
12605
INADA Naoki3ae20562017-01-16 20:41:20 +090012606/*[clinic input]
12607str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012608
INADA Naoki3ae20562017-01-16 20:41:20 +090012609 old: unicode
12610 new: unicode
12611 count: Py_ssize_t = -1
12612 Maximum number of occurrences to replace.
12613 -1 (the default value) means replace all occurrences.
12614 /
12615
12616Return a copy with all occurrences of substring old replaced by new.
12617
12618If the optional argument count is given, only the first count occurrences are
12619replaced.
12620[clinic start generated code]*/
12621
12622static PyObject *
12623unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12624 Py_ssize_t count)
12625/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012627 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012628 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012629 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012630}
12631
Alexander Belopolsky40018472011-02-26 01:02:56 +000012632static PyObject *
12633unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012634{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012635 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 Py_ssize_t isize;
12637 Py_ssize_t osize, squote, dquote, i, o;
12638 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012639 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012643 return NULL;
12644
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 isize = PyUnicode_GET_LENGTH(unicode);
12646 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012647
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012648 /* Compute length of output, quote characters, and
12649 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012650 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 max = 127;
12652 squote = dquote = 0;
12653 ikind = PyUnicode_KIND(unicode);
12654 for (i = 0; i < isize; i++) {
12655 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012656 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 case '\'': squote++; break;
12659 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012661 incr = 2;
12662 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 default:
12664 /* Fast-path ASCII */
12665 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012666 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012668 ;
12669 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012670 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012672 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012673 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012674 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012675 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012676 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012677 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012678 if (osize > PY_SSIZE_T_MAX - incr) {
12679 PyErr_SetString(PyExc_OverflowError,
12680 "string is too long to generate repr");
12681 return NULL;
12682 }
12683 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 }
12685
12686 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012687 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012689 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 if (dquote)
12691 /* Both squote and dquote present. Use squote,
12692 and escape them */
12693 osize += squote;
12694 else
12695 quote = '"';
12696 }
Victor Stinner55c08782013-04-14 18:45:39 +020012697 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012698
12699 repr = PyUnicode_New(osize, max);
12700 if (repr == NULL)
12701 return NULL;
12702 okind = PyUnicode_KIND(repr);
12703 odata = PyUnicode_DATA(repr);
12704
12705 PyUnicode_WRITE(okind, odata, 0, quote);
12706 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012707 if (unchanged) {
12708 _PyUnicode_FastCopyCharacters(repr, 1,
12709 unicode, 0,
12710 isize);
12711 }
12712 else {
12713 for (i = 0, o = 1; i < isize; i++) {
12714 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012715
Victor Stinner55c08782013-04-14 18:45:39 +020012716 /* Escape quotes and backslashes */
12717 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012718 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012719 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012720 continue;
12721 }
12722
12723 /* Map special whitespace to '\t', \n', '\r' */
12724 if (ch == '\t') {
12725 PyUnicode_WRITE(okind, odata, o++, '\\');
12726 PyUnicode_WRITE(okind, odata, o++, 't');
12727 }
12728 else if (ch == '\n') {
12729 PyUnicode_WRITE(okind, odata, o++, '\\');
12730 PyUnicode_WRITE(okind, odata, o++, 'n');
12731 }
12732 else if (ch == '\r') {
12733 PyUnicode_WRITE(okind, odata, o++, '\\');
12734 PyUnicode_WRITE(okind, odata, o++, 'r');
12735 }
12736
12737 /* Map non-printable US ASCII to '\xhh' */
12738 else if (ch < ' ' || ch == 0x7F) {
12739 PyUnicode_WRITE(okind, odata, o++, '\\');
12740 PyUnicode_WRITE(okind, odata, o++, 'x');
12741 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12742 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12743 }
12744
12745 /* Copy ASCII characters as-is */
12746 else if (ch < 0x7F) {
12747 PyUnicode_WRITE(okind, odata, o++, ch);
12748 }
12749
12750 /* Non-ASCII characters */
12751 else {
12752 /* Map Unicode whitespace and control characters
12753 (categories Z* and C* except ASCII space)
12754 */
12755 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12756 PyUnicode_WRITE(okind, odata, o++, '\\');
12757 /* Map 8-bit characters to '\xhh' */
12758 if (ch <= 0xff) {
12759 PyUnicode_WRITE(okind, odata, o++, 'x');
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12762 }
12763 /* Map 16-bit characters to '\uxxxx' */
12764 else if (ch <= 0xffff) {
12765 PyUnicode_WRITE(okind, odata, o++, 'u');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12770 }
12771 /* Map 21-bit characters to '\U00xxxxxx' */
12772 else {
12773 PyUnicode_WRITE(okind, odata, o++, 'U');
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12776 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12777 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12778 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12779 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12780 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12781 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12782 }
12783 }
12784 /* Copy characters as-is */
12785 else {
12786 PyUnicode_WRITE(okind, odata, o++, ch);
12787 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012788 }
12789 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012790 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012791 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012792 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012793 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794}
12795
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012796PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012797 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798\n\
12799Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012800such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801arguments start and end are interpreted as in slice notation.\n\
12802\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012803Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012804
12805static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012806unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012808 /* initialize variables to prevent gcc warning */
12809 PyObject *substring = NULL;
12810 Py_ssize_t start = 0;
12811 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012812 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012813
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012814 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012815 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012817 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012818 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012819
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012820 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012821
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 if (result == -2)
12823 return NULL;
12824
Christian Heimes217cfd12007-12-02 14:31:20 +000012825 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826}
12827
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012828PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012829 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012831Return the highest index in S where substring sub is found,\n\
12832such that sub is contained within S[start:end]. Optional\n\
12833arguments start and end are interpreted as in slice notation.\n\
12834\n\
12835Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012836
12837static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012838unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012840 /* initialize variables to prevent gcc warning */
12841 PyObject *substring = NULL;
12842 Py_ssize_t start = 0;
12843 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012844 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012845
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012846 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012847 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012848
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012849 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012850 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012852 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012853
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012854 if (result == -2)
12855 return NULL;
12856
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857 if (result < 0) {
12858 PyErr_SetString(PyExc_ValueError, "substring not found");
12859 return NULL;
12860 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012861
Christian Heimes217cfd12007-12-02 14:31:20 +000012862 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012863}
12864
INADA Naoki3ae20562017-01-16 20:41:20 +090012865/*[clinic input]
12866str.rjust as unicode_rjust
12867
12868 width: Py_ssize_t
12869 fillchar: Py_UCS4 = ' '
12870 /
12871
12872Return a right-justified string of length width.
12873
12874Padding is done using the specified fill character (default is a space).
12875[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
12877static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012878unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12879/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012881 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882 return NULL;
12883
Victor Stinnerc4b49542011-12-11 22:44:26 +010012884 if (PyUnicode_GET_LENGTH(self) >= width)
12885 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886
Victor Stinnerc4b49542011-12-11 22:44:26 +010012887 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012888}
12889
Alexander Belopolsky40018472011-02-26 01:02:56 +000012890PyObject *
12891PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012892{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012893 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012894 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012896 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012897}
12898
INADA Naoki3ae20562017-01-16 20:41:20 +090012899/*[clinic input]
12900str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012901
INADA Naoki3ae20562017-01-16 20:41:20 +090012902 sep: object = None
12903 The delimiter according which to split the string.
12904 None (the default value) means split according to any whitespace,
12905 and discard empty strings from the result.
12906 maxsplit: Py_ssize_t = -1
12907 Maximum number of splits to do.
12908 -1 (the default value) means no limit.
12909
12910Return a list of the words in the string, using sep as the delimiter string.
12911[clinic start generated code]*/
12912
12913static PyObject *
12914unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12915/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916{
INADA Naoki3ae20562017-01-16 20:41:20 +090012917 if (sep == Py_None)
12918 return split(self, NULL, maxsplit);
12919 if (PyUnicode_Check(sep))
12920 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012921
Victor Stinner998b8062018-09-12 00:23:25 +020012922 PyErr_Format(PyExc_TypeError,
12923 "must be str or None, not %.100s",
12924 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012925 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926}
12927
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012929PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012931 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012932 int kind1, kind2;
12933 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012935
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012936 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012937 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012938
Victor Stinner14f8f022011-10-05 20:58:25 +020012939 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012940 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012941 len1 = PyUnicode_GET_LENGTH(str_obj);
12942 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012943 if (kind1 < kind2 || len1 < len2) {
12944 _Py_INCREF_UNICODE_EMPTY();
12945 if (!unicode_empty)
12946 out = NULL;
12947 else {
12948 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12949 Py_DECREF(unicode_empty);
12950 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 return out;
12952 }
12953 buf1 = PyUnicode_DATA(str_obj);
12954 buf2 = PyUnicode_DATA(sep_obj);
12955 if (kind2 != kind1) {
12956 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12957 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012958 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012959 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012960
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012961 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012962 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012963 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12964 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 else
12966 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012967 break;
12968 case PyUnicode_2BYTE_KIND:
12969 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12970 break;
12971 case PyUnicode_4BYTE_KIND:
12972 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12973 break;
12974 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012975 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012976 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012978 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012979 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980
12981 return out;
12982}
12983
12984
12985PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012986PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012988 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012989 int kind1, kind2;
12990 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012992
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012993 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012994 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012995
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012996 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012997 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012998 len1 = PyUnicode_GET_LENGTH(str_obj);
12999 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 if (kind1 < kind2 || len1 < len2) {
13001 _Py_INCREF_UNICODE_EMPTY();
13002 if (!unicode_empty)
13003 out = NULL;
13004 else {
13005 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13006 Py_DECREF(unicode_empty);
13007 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 return out;
13009 }
13010 buf1 = PyUnicode_DATA(str_obj);
13011 buf2 = PyUnicode_DATA(sep_obj);
13012 if (kind2 != kind1) {
13013 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13014 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013015 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013016 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013017
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013018 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013019 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013020 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13021 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 else
13023 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013024 break;
13025 case PyUnicode_2BYTE_KIND:
13026 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13027 break;
13028 case PyUnicode_4BYTE_KIND:
13029 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13030 break;
13031 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013032 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013033 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013034
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013035 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013036 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037
13038 return out;
13039}
13040
INADA Naoki3ae20562017-01-16 20:41:20 +090013041/*[clinic input]
13042str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013043
INADA Naoki3ae20562017-01-16 20:41:20 +090013044 sep: object
13045 /
13046
13047Partition the string into three parts using the given separator.
13048
13049This will search for the separator in the string. If the separator is found,
13050returns a 3-tuple containing the part before the separator, the separator
13051itself, and the part after it.
13052
13053If the separator is not found, returns a 3-tuple containing the original string
13054and two empty strings.
13055[clinic start generated code]*/
13056
13057static PyObject *
13058unicode_partition(PyObject *self, PyObject *sep)
13059/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060{
INADA Naoki3ae20562017-01-16 20:41:20 +090013061 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062}
13063
INADA Naoki3ae20562017-01-16 20:41:20 +090013064/*[clinic input]
13065str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013066
INADA Naoki3ae20562017-01-16 20:41:20 +090013067Partition the string into three parts using the given separator.
13068
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013069This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013070the separator is found, returns a 3-tuple containing the part before the
13071separator, the separator itself, and the part after it.
13072
13073If the separator is not found, returns a 3-tuple containing two empty strings
13074and the original string.
13075[clinic start generated code]*/
13076
13077static PyObject *
13078unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013079/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013080{
INADA Naoki3ae20562017-01-16 20:41:20 +090013081 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013082}
13083
Alexander Belopolsky40018472011-02-26 01:02:56 +000013084PyObject *
13085PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013086{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013087 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013088 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013089
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013090 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013091}
13092
INADA Naoki3ae20562017-01-16 20:41:20 +090013093/*[clinic input]
13094str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013095
INADA Naoki3ae20562017-01-16 20:41:20 +090013096Return a list of the words in the string, using sep as the delimiter string.
13097
13098Splits are done starting at the end of the string and working to the front.
13099[clinic start generated code]*/
13100
13101static PyObject *
13102unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13103/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013104{
INADA Naoki3ae20562017-01-16 20:41:20 +090013105 if (sep == Py_None)
13106 return rsplit(self, NULL, maxsplit);
13107 if (PyUnicode_Check(sep))
13108 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013109
Victor Stinner998b8062018-09-12 00:23:25 +020013110 PyErr_Format(PyExc_TypeError,
13111 "must be str or None, not %.100s",
13112 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013113 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013114}
13115
INADA Naoki3ae20562017-01-16 20:41:20 +090013116/*[clinic input]
13117str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013118
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013119 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013120
13121Return a list of the lines in the string, breaking at line boundaries.
13122
13123Line breaks are not included in the resulting list unless keepends is given and
13124true.
13125[clinic start generated code]*/
13126
13127static PyObject *
13128unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013129/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013131 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132}
13133
13134static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013135PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013137 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013138}
13139
INADA Naoki3ae20562017-01-16 20:41:20 +090013140/*[clinic input]
13141str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013142
INADA Naoki3ae20562017-01-16 20:41:20 +090013143Convert uppercase characters to lowercase and lowercase characters to uppercase.
13144[clinic start generated code]*/
13145
13146static PyObject *
13147unicode_swapcase_impl(PyObject *self)
13148/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013149{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013150 if (PyUnicode_READY(self) == -1)
13151 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013152 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013153}
13154
Larry Hastings61272b72014-01-07 12:41:53 -080013155/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013156
Larry Hastings31826802013-10-19 00:09:25 -070013157@staticmethod
13158str.maketrans as unicode_maketrans
13159
13160 x: object
13161
13162 y: unicode=NULL
13163
13164 z: unicode=NULL
13165
13166 /
13167
13168Return a translation table usable for str.translate().
13169
13170If there is only one argument, it must be a dictionary mapping Unicode
13171ordinals (integers) or characters to Unicode ordinals, strings or None.
13172Character keys will be then converted to ordinals.
13173If there are two arguments, they must be strings of equal length, and
13174in the resulting dictionary, each character in x will be mapped to the
13175character at the same position in y. If there is a third argument, it
13176must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013177[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013178
Larry Hastings31826802013-10-19 00:09:25 -070013179static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013180unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013181/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013182{
Georg Brandlceee0772007-11-27 23:48:05 +000013183 PyObject *new = NULL, *key, *value;
13184 Py_ssize_t i = 0;
13185 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013186
Georg Brandlceee0772007-11-27 23:48:05 +000013187 new = PyDict_New();
13188 if (!new)
13189 return NULL;
13190 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013191 int x_kind, y_kind, z_kind;
13192 void *x_data, *y_data, *z_data;
13193
Georg Brandlceee0772007-11-27 23:48:05 +000013194 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013195 if (!PyUnicode_Check(x)) {
13196 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13197 "be a string if there is a second argument");
13198 goto err;
13199 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013201 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13202 "arguments must have equal length");
13203 goto err;
13204 }
13205 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013206 x_kind = PyUnicode_KIND(x);
13207 y_kind = PyUnicode_KIND(y);
13208 x_data = PyUnicode_DATA(x);
13209 y_data = PyUnicode_DATA(y);
13210 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13211 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013212 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013213 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013214 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013215 if (!value) {
13216 Py_DECREF(key);
13217 goto err;
13218 }
Georg Brandlceee0772007-11-27 23:48:05 +000013219 res = PyDict_SetItem(new, key, value);
13220 Py_DECREF(key);
13221 Py_DECREF(value);
13222 if (res < 0)
13223 goto err;
13224 }
13225 /* create entries for deleting chars in z */
13226 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013227 z_kind = PyUnicode_KIND(z);
13228 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013229 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013231 if (!key)
13232 goto err;
13233 res = PyDict_SetItem(new, key, Py_None);
13234 Py_DECREF(key);
13235 if (res < 0)
13236 goto err;
13237 }
13238 }
13239 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013240 int kind;
13241 void *data;
13242
Georg Brandlceee0772007-11-27 23:48:05 +000013243 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013244 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013245 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13246 "to maketrans it must be a dict");
13247 goto err;
13248 }
13249 /* copy entries into the new dict, converting string keys to int keys */
13250 while (PyDict_Next(x, &i, &key, &value)) {
13251 if (PyUnicode_Check(key)) {
13252 /* convert string keys to integer keys */
13253 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013254 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013255 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13256 "table must be of length 1");
13257 goto err;
13258 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013259 kind = PyUnicode_KIND(key);
13260 data = PyUnicode_DATA(key);
13261 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013262 if (!newkey)
13263 goto err;
13264 res = PyDict_SetItem(new, newkey, value);
13265 Py_DECREF(newkey);
13266 if (res < 0)
13267 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013268 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013269 /* just keep integer keys */
13270 if (PyDict_SetItem(new, key, value) < 0)
13271 goto err;
13272 } else {
13273 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13274 "be strings or integers");
13275 goto err;
13276 }
13277 }
13278 }
13279 return new;
13280 err:
13281 Py_DECREF(new);
13282 return NULL;
13283}
13284
INADA Naoki3ae20562017-01-16 20:41:20 +090013285/*[clinic input]
13286str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013287
INADA Naoki3ae20562017-01-16 20:41:20 +090013288 table: object
13289 Translation table, which must be a mapping of Unicode ordinals to
13290 Unicode ordinals, strings, or None.
13291 /
13292
13293Replace each character in the string using the given translation table.
13294
13295The table must implement lookup/indexing via __getitem__, for instance a
13296dictionary or list. If this operation raises LookupError, the character is
13297left untouched. Characters mapped to None are deleted.
13298[clinic start generated code]*/
13299
13300static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013301unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013302/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013304 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013305}
13306
INADA Naoki3ae20562017-01-16 20:41:20 +090013307/*[clinic input]
13308str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013309
INADA Naoki3ae20562017-01-16 20:41:20 +090013310Return a copy of the string converted to uppercase.
13311[clinic start generated code]*/
13312
13313static PyObject *
13314unicode_upper_impl(PyObject *self)
13315/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013317 if (PyUnicode_READY(self) == -1)
13318 return NULL;
13319 if (PyUnicode_IS_ASCII(self))
13320 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013321 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013322}
13323
INADA Naoki3ae20562017-01-16 20:41:20 +090013324/*[clinic input]
13325str.zfill as unicode_zfill
13326
13327 width: Py_ssize_t
13328 /
13329
13330Pad a numeric string with zeros on the left, to fill a field of the given width.
13331
13332The string is never truncated.
13333[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013334
13335static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013336unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013337/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013338{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013339 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013340 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013341 int kind;
13342 void *data;
13343 Py_UCS4 chr;
13344
Benjamin Petersonbac79492012-01-14 13:34:47 -050013345 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013346 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013347
Victor Stinnerc4b49542011-12-11 22:44:26 +010013348 if (PyUnicode_GET_LENGTH(self) >= width)
13349 return unicode_result_unchanged(self);
13350
13351 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013352
13353 u = pad(self, fill, 0, '0');
13354
Walter Dörwald068325e2002-04-15 13:36:47 +000013355 if (u == NULL)
13356 return NULL;
13357
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 kind = PyUnicode_KIND(u);
13359 data = PyUnicode_DATA(u);
13360 chr = PyUnicode_READ(kind, data, fill);
13361
13362 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013364 PyUnicode_WRITE(kind, data, 0, chr);
13365 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013366 }
13367
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013368 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013369 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013370}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013371
#if 0
/* Compiled out by the surrounding "#if 0".  When enabled, this wrapper
   simply forwards self to PyUnicode_TransformDecimalAndSpaceToASCII()
   and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13379
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013380PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013381 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013382\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013383Return True if S starts with the specified prefix, False otherwise.\n\
13384With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013385With optional end, stop comparing S at that position.\n\
13386prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387
13388static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013389unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013390 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013392 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013393 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013394 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013395 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013396 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013397
Jesus Ceaac451502011-04-20 17:09:23 +020013398 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013399 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013400 if (PyTuple_Check(subobj)) {
13401 Py_ssize_t i;
13402 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013403 substring = PyTuple_GET_ITEM(subobj, i);
13404 if (!PyUnicode_Check(substring)) {
13405 PyErr_Format(PyExc_TypeError,
13406 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013407 "not %.100s",
13408 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013409 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013410 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013411 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013412 if (result == -1)
13413 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013414 if (result) {
13415 Py_RETURN_TRUE;
13416 }
13417 }
13418 /* nothing matched */
13419 Py_RETURN_FALSE;
13420 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013421 if (!PyUnicode_Check(subobj)) {
13422 PyErr_Format(PyExc_TypeError,
13423 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013424 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013426 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013427 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013428 if (result == -1)
13429 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013430 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431}
13432
13433
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013434PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013435 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013436\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013437Return True if S ends with the specified suffix, False otherwise.\n\
13438With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013439With optional end, stop comparing S at that position.\n\
13440suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441
13442static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013443unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013444 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013446 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013447 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013448 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013449 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013450 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013451
Jesus Ceaac451502011-04-20 17:09:23 +020013452 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013454 if (PyTuple_Check(subobj)) {
13455 Py_ssize_t i;
13456 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013457 substring = PyTuple_GET_ITEM(subobj, i);
13458 if (!PyUnicode_Check(substring)) {
13459 PyErr_Format(PyExc_TypeError,
13460 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013461 "not %.100s",
13462 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013463 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013464 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013465 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013466 if (result == -1)
13467 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013468 if (result) {
13469 Py_RETURN_TRUE;
13470 }
13471 }
13472 Py_RETURN_FALSE;
13473 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013474 if (!PyUnicode_Check(subobj)) {
13475 PyErr_Format(PyExc_TypeError,
13476 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013477 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013478 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013479 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013480 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013481 if (result == -1)
13482 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013483 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013484}
13485
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013486static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013487_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013488{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013489 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13490 writer->data = PyUnicode_DATA(writer->buffer);
13491
13492 if (!writer->readonly) {
13493 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013494 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013495 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013496 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013497 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13498 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13499 writer->kind = PyUnicode_WCHAR_KIND;
13500 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13501
Victor Stinner8f674cc2013-04-17 23:02:17 +020013502 /* Copy-on-write mode: set buffer size to 0 so
13503 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13504 * next write. */
13505 writer->size = 0;
13506 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013507}
13508
Victor Stinnerd3f08822012-05-29 12:57:52 +020013509void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013510_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013511{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013512 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013513
13514 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013515 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013516
13517 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13518 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13519 writer->kind = PyUnicode_WCHAR_KIND;
13520 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013521}
13522
Victor Stinnerd3f08822012-05-29 12:57:52 +020013523int
13524_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13525 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013526{
13527 Py_ssize_t newlen;
13528 PyObject *newbuffer;
13529
Victor Stinner2740e462016-09-06 16:58:36 -070013530 assert(maxchar <= MAX_UNICODE);
13531
Victor Stinnerca9381e2015-09-22 00:58:32 +020013532 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013533 assert((maxchar > writer->maxchar && length >= 0)
13534 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013535
Victor Stinner202fdca2012-05-07 12:47:02 +020013536 if (length > PY_SSIZE_T_MAX - writer->pos) {
13537 PyErr_NoMemory();
13538 return -1;
13539 }
13540 newlen = writer->pos + length;
13541
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013542 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013543
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013545 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013546 if (writer->overallocate
13547 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13548 /* overallocate to limit the number of realloc() */
13549 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013550 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013551 if (newlen < writer->min_length)
13552 newlen = writer->min_length;
13553
Victor Stinnerd3f08822012-05-29 12:57:52 +020013554 writer->buffer = PyUnicode_New(newlen, maxchar);
13555 if (writer->buffer == NULL)
13556 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013558 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013559 if (writer->overallocate
13560 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13561 /* overallocate to limit the number of realloc() */
13562 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013564 if (newlen < writer->min_length)
13565 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013566
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013567 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013568 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013569 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013570 newbuffer = PyUnicode_New(newlen, maxchar);
13571 if (newbuffer == NULL)
13572 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013573 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13574 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013575 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013576 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013577 }
13578 else {
13579 newbuffer = resize_compact(writer->buffer, newlen);
13580 if (newbuffer == NULL)
13581 return -1;
13582 }
13583 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013584 }
13585 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013586 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013587 newbuffer = PyUnicode_New(writer->size, maxchar);
13588 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013589 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013590 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13591 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013592 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013593 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013594 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013595 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013596
13597#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013598}
13599
Victor Stinnerca9381e2015-09-22 00:58:32 +020013600int
13601_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13602 enum PyUnicode_Kind kind)
13603{
13604 Py_UCS4 maxchar;
13605
13606 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13607 assert(writer->kind < kind);
13608
13609 switch (kind)
13610 {
13611 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13612 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13613 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13614 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013615 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013616 }
13617
13618 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13619}
13620
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013621static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013622_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013623{
Victor Stinner2740e462016-09-06 16:58:36 -070013624 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013625 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13626 return -1;
13627 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13628 writer->pos++;
13629 return 0;
13630}
13631
13632int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013633_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13634{
13635 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13636}
13637
13638int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013639_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13640{
13641 Py_UCS4 maxchar;
13642 Py_ssize_t len;
13643
13644 if (PyUnicode_READY(str) == -1)
13645 return -1;
13646 len = PyUnicode_GET_LENGTH(str);
13647 if (len == 0)
13648 return 0;
13649 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13650 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013651 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013652 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013653 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013654 Py_INCREF(str);
13655 writer->buffer = str;
13656 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013657 writer->pos += len;
13658 return 0;
13659 }
13660 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13661 return -1;
13662 }
13663 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13664 str, 0, len);
13665 writer->pos += len;
13666 return 0;
13667}
13668
Victor Stinnere215d962012-10-06 23:03:36 +020013669int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013670_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13671 Py_ssize_t start, Py_ssize_t end)
13672{
13673 Py_UCS4 maxchar;
13674 Py_ssize_t len;
13675
13676 if (PyUnicode_READY(str) == -1)
13677 return -1;
13678
13679 assert(0 <= start);
13680 assert(end <= PyUnicode_GET_LENGTH(str));
13681 assert(start <= end);
13682
13683 if (end == 0)
13684 return 0;
13685
13686 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13687 return _PyUnicodeWriter_WriteStr(writer, str);
13688
13689 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13690 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13691 else
13692 maxchar = writer->maxchar;
13693 len = end - start;
13694
13695 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13696 return -1;
13697
13698 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13699 str, start, len);
13700 writer->pos += len;
13701 return 0;
13702}
13703
13704int
Victor Stinner4a587072013-11-19 12:54:53 +010013705_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13706 const char *ascii, Py_ssize_t len)
13707{
13708 if (len == -1)
13709 len = strlen(ascii);
13710
13711 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13712
13713 if (writer->buffer == NULL && !writer->overallocate) {
13714 PyObject *str;
13715
13716 str = _PyUnicode_FromASCII(ascii, len);
13717 if (str == NULL)
13718 return -1;
13719
13720 writer->readonly = 1;
13721 writer->buffer = str;
13722 _PyUnicodeWriter_Update(writer);
13723 writer->pos += len;
13724 return 0;
13725 }
13726
13727 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13728 return -1;
13729
13730 switch (writer->kind)
13731 {
13732 case PyUnicode_1BYTE_KIND:
13733 {
13734 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13735 Py_UCS1 *data = writer->data;
13736
Christian Heimesf051e432016-09-13 20:22:02 +020013737 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013738 break;
13739 }
13740 case PyUnicode_2BYTE_KIND:
13741 {
13742 _PyUnicode_CONVERT_BYTES(
13743 Py_UCS1, Py_UCS2,
13744 ascii, ascii + len,
13745 (Py_UCS2 *)writer->data + writer->pos);
13746 break;
13747 }
13748 case PyUnicode_4BYTE_KIND:
13749 {
13750 _PyUnicode_CONVERT_BYTES(
13751 Py_UCS1, Py_UCS4,
13752 ascii, ascii + len,
13753 (Py_UCS4 *)writer->data + writer->pos);
13754 break;
13755 }
13756 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013757 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013758 }
13759
13760 writer->pos += len;
13761 return 0;
13762}
13763
13764int
13765_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13766 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013767{
13768 Py_UCS4 maxchar;
13769
13770 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13771 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13772 return -1;
13773 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13774 writer->pos += len;
13775 return 0;
13776}
13777
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013779_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013780{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013781 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013782
Victor Stinnerd3f08822012-05-29 12:57:52 +020013783 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013784 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013785 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013787
13788 str = writer->buffer;
13789 writer->buffer = NULL;
13790
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013791 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013792 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13793 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013794 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013795
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013796 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13797 PyObject *str2;
13798 str2 = resize_compact(str, writer->pos);
13799 if (str2 == NULL) {
13800 Py_DECREF(str);
13801 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013802 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013803 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013804 }
13805
Victor Stinner15a0bd32013-07-08 22:29:55 +020013806 assert(_PyUnicode_CheckConsistency(str, 1));
13807 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013808}
13809
Victor Stinnerd3f08822012-05-29 12:57:52 +020013810void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013811_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013812{
13813 Py_CLEAR(writer->buffer);
13814}
13815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013816#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013817
13818PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013819 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013820\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013821Return a formatted version of S, using substitutions from args and kwargs.\n\
13822The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013823
Eric Smith27bbca62010-11-04 17:06:58 +000013824PyDoc_STRVAR(format_map__doc__,
13825 "S.format_map(mapping) -> str\n\
13826\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013827Return a formatted version of S, using substitutions from mapping.\n\
13828The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013829
INADA Naoki3ae20562017-01-16 20:41:20 +090013830/*[clinic input]
13831str.__format__ as unicode___format__
13832
13833 format_spec: unicode
13834 /
13835
13836Return a formatted version of the string as described by format_spec.
13837[clinic start generated code]*/
13838
Eric Smith4a7d76d2008-05-30 18:10:19 +000013839static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013840unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013841/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013842{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013843 _PyUnicodeWriter writer;
13844 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013845
Victor Stinnerd3f08822012-05-29 12:57:52 +020013846 if (PyUnicode_READY(self) == -1)
13847 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013848 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013849 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13850 self, format_spec, 0,
13851 PyUnicode_GET_LENGTH(format_spec));
13852 if (ret == -1) {
13853 _PyUnicodeWriter_Dealloc(&writer);
13854 return NULL;
13855 }
13856 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013857}
13858
INADA Naoki3ae20562017-01-16 20:41:20 +090013859/*[clinic input]
13860str.__sizeof__ as unicode_sizeof
13861
13862Return the size of the string in memory, in bytes.
13863[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013864
13865static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013866unicode_sizeof_impl(PyObject *self)
13867/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013868{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013869 Py_ssize_t size;
13870
13871 /* If it's a compact object, account for base structure +
13872 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013873 if (PyUnicode_IS_COMPACT_ASCII(self))
13874 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13875 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013876 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013877 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013878 else {
13879 /* If it is a two-block object, account for base object, and
13880 for character block if present. */
13881 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013882 if (_PyUnicode_DATA_ANY(self))
13883 size += (PyUnicode_GET_LENGTH(self) + 1) *
13884 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013885 }
13886 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013887 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013888 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13889 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13890 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13891 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892
13893 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013894}
13895
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013896static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013897unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013898{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013899 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013900 if (!copy)
13901 return NULL;
13902 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013903}
13904
Guido van Rossumd57fd912000-03-10 22:53:23 +000013905static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013906 UNICODE_ENCODE_METHODDEF
13907 UNICODE_REPLACE_METHODDEF
13908 UNICODE_SPLIT_METHODDEF
13909 UNICODE_RSPLIT_METHODDEF
13910 UNICODE_JOIN_METHODDEF
13911 UNICODE_CAPITALIZE_METHODDEF
13912 UNICODE_CASEFOLD_METHODDEF
13913 UNICODE_TITLE_METHODDEF
13914 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013915 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013916 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013917 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013918 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013919 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013920 UNICODE_LJUST_METHODDEF
13921 UNICODE_LOWER_METHODDEF
13922 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013923 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13924 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013925 UNICODE_RJUST_METHODDEF
13926 UNICODE_RSTRIP_METHODDEF
13927 UNICODE_RPARTITION_METHODDEF
13928 UNICODE_SPLITLINES_METHODDEF
13929 UNICODE_STRIP_METHODDEF
13930 UNICODE_SWAPCASE_METHODDEF
13931 UNICODE_TRANSLATE_METHODDEF
13932 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013933 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13934 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013935 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013936 UNICODE_ISLOWER_METHODDEF
13937 UNICODE_ISUPPER_METHODDEF
13938 UNICODE_ISTITLE_METHODDEF
13939 UNICODE_ISSPACE_METHODDEF
13940 UNICODE_ISDECIMAL_METHODDEF
13941 UNICODE_ISDIGIT_METHODDEF
13942 UNICODE_ISNUMERIC_METHODDEF
13943 UNICODE_ISALPHA_METHODDEF
13944 UNICODE_ISALNUM_METHODDEF
13945 UNICODE_ISIDENTIFIER_METHODDEF
13946 UNICODE_ISPRINTABLE_METHODDEF
13947 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013948 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013949 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013950 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013951 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013952 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013953#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013954 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013955 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013956#endif
13957
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013958 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013959 {NULL, NULL}
13960};
13961
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013962static PyObject *
13963unicode_mod(PyObject *v, PyObject *w)
13964{
Brian Curtindfc80e32011-08-10 20:28:54 -050013965 if (!PyUnicode_Check(v))
13966 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013967 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013968}
13969
13970static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013971 0, /*nb_add*/
13972 0, /*nb_subtract*/
13973 0, /*nb_multiply*/
13974 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013975};
13976
Guido van Rossumd57fd912000-03-10 22:53:23 +000013977static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013978 (lenfunc) unicode_length, /* sq_length */
13979 PyUnicode_Concat, /* sq_concat */
13980 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13981 (ssizeargfunc) unicode_getitem, /* sq_item */
13982 0, /* sq_slice */
13983 0, /* sq_ass_item */
13984 0, /* sq_ass_slice */
13985 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013986};
13987
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013988static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013989unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013990{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013991 if (PyUnicode_READY(self) == -1)
13992 return NULL;
13993
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013994 if (PyIndex_Check(item)) {
13995 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013996 if (i == -1 && PyErr_Occurred())
13997 return NULL;
13998 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013999 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014000 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014001 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014002 Py_ssize_t start, stop, step, slicelength, i;
14003 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014004 PyObject *result;
14005 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014006 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014007 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014008
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014009 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014010 return NULL;
14011 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014012 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14013 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014014
14015 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014016 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014017 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014018 slicelength == PyUnicode_GET_LENGTH(self)) {
14019 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014020 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014021 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014022 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014023 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014024 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014025 src_kind = PyUnicode_KIND(self);
14026 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014027 if (!PyUnicode_IS_ASCII(self)) {
14028 kind_limit = kind_maxchar_limit(src_kind);
14029 max_char = 0;
14030 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14031 ch = PyUnicode_READ(src_kind, src_data, cur);
14032 if (ch > max_char) {
14033 max_char = ch;
14034 if (max_char >= kind_limit)
14035 break;
14036 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014037 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014038 }
Victor Stinner55c99112011-10-13 01:17:06 +020014039 else
14040 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014041 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014042 if (result == NULL)
14043 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014044 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014045 dest_data = PyUnicode_DATA(result);
14046
14047 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014048 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14049 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014050 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014051 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014052 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014053 } else {
14054 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14055 return NULL;
14056 }
14057}
14058
14059static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014060 (lenfunc)unicode_length, /* mp_length */
14061 (binaryfunc)unicode_subscript, /* mp_subscript */
14062 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014063};
14064
Guido van Rossumd57fd912000-03-10 22:53:23 +000014065
Guido van Rossumd57fd912000-03-10 22:53:23 +000014066/* Helpers for PyUnicode_Format() */
14067
Victor Stinnera47082312012-10-04 02:19:54 +020014068struct unicode_formatter_t {
14069 PyObject *args;
14070 int args_owned;
14071 Py_ssize_t arglen, argidx;
14072 PyObject *dict;
14073
14074 enum PyUnicode_Kind fmtkind;
14075 Py_ssize_t fmtcnt, fmtpos;
14076 void *fmtdata;
14077 PyObject *fmtstr;
14078
14079 _PyUnicodeWriter writer;
14080};
14081
14082struct unicode_format_arg_t {
14083 Py_UCS4 ch;
14084 int flags;
14085 Py_ssize_t width;
14086 int prec;
14087 int sign;
14088};
14089
Guido van Rossumd57fd912000-03-10 22:53:23 +000014090static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014091unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014092{
Victor Stinnera47082312012-10-04 02:19:54 +020014093 Py_ssize_t argidx = ctx->argidx;
14094
14095 if (argidx < ctx->arglen) {
14096 ctx->argidx++;
14097 if (ctx->arglen < 0)
14098 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014099 else
Victor Stinnera47082312012-10-04 02:19:54 +020014100 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014101 }
14102 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014103 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014104 return NULL;
14105}
14106
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014107/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014108
Victor Stinnera47082312012-10-04 02:19:54 +020014109/* Format a float into the writer if the writer is not NULL, or into *p_output
14110 otherwise.
14111
14112 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014113static int
Victor Stinnera47082312012-10-04 02:19:54 +020014114formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14115 PyObject **p_output,
14116 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014117{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014118 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014121 int prec;
14122 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014123
Guido van Rossumd57fd912000-03-10 22:53:23 +000014124 x = PyFloat_AsDouble(v);
14125 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014126 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014127
Victor Stinnera47082312012-10-04 02:19:54 +020014128 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014129 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014130 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014131
Victor Stinnera47082312012-10-04 02:19:54 +020014132 if (arg->flags & F_ALT)
14133 dtoa_flags = Py_DTSF_ALT;
14134 else
14135 dtoa_flags = 0;
14136 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014137 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014138 return -1;
14139 len = strlen(p);
14140 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014141 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014142 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014143 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014144 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014145 }
14146 else
14147 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014148 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014149 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014150}
14151
Victor Stinnerd0880d52012-04-27 23:40:13 +020014152/* formatlong() emulates the format codes d, u, o, x and X, and
14153 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14154 * Python's regular ints.
14155 * Return value: a new PyUnicodeObject*, or NULL if error.
14156 * The output string is of the form
14157 * "-"? ("0x" | "0X")? digit+
14158 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14159 * set in flags. The case of hex digits will be correct,
14160 * There will be at least prec digits, zero-filled on the left if
14161 * necessary to get that many.
14162 * val object to be converted
14163 * flags bitmask of format flags; only F_ALT is looked at
14164 * prec minimum number of digits; 0-fill on left if needed
14165 * type a character in [duoxX]; u acts the same as d
14166 *
14167 * CAUTION: o, x and X conversions on regular ints can never
14168 * produce a '-' sign, but can for Python's unbounded ints.
14169 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014170PyObject *
14171_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014172{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014173 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014174 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014175 Py_ssize_t i;
14176 int sign; /* 1 if '-', else 0 */
14177 int len; /* number of characters */
14178 Py_ssize_t llen;
14179 int numdigits; /* len == numnondigits + numdigits */
14180 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014181
Victor Stinnerd0880d52012-04-27 23:40:13 +020014182 /* Avoid exceeding SSIZE_T_MAX */
14183 if (prec > INT_MAX-3) {
14184 PyErr_SetString(PyExc_OverflowError,
14185 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014186 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014187 }
14188
14189 assert(PyLong_Check(val));
14190
14191 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014192 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014193 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014195 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014196 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014197 /* int and int subclasses should print numerically when a numeric */
14198 /* format code is used (see issue18780) */
14199 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014200 break;
14201 case 'o':
14202 numnondigits = 2;
14203 result = PyNumber_ToBase(val, 8);
14204 break;
14205 case 'x':
14206 case 'X':
14207 numnondigits = 2;
14208 result = PyNumber_ToBase(val, 16);
14209 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014210 }
14211 if (!result)
14212 return NULL;
14213
14214 assert(unicode_modifiable(result));
14215 assert(PyUnicode_IS_READY(result));
14216 assert(PyUnicode_IS_ASCII(result));
14217
14218 /* To modify the string in-place, there can only be one reference. */
14219 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014220 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014221 PyErr_BadInternalCall();
14222 return NULL;
14223 }
14224 buf = PyUnicode_DATA(result);
14225 llen = PyUnicode_GET_LENGTH(result);
14226 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014227 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014228 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014229 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014230 return NULL;
14231 }
14232 len = (int)llen;
14233 sign = buf[0] == '-';
14234 numnondigits += sign;
14235 numdigits = len - numnondigits;
14236 assert(numdigits > 0);
14237
14238 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014239 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014240 (type == 'o' || type == 'x' || type == 'X'))) {
14241 assert(buf[sign] == '0');
14242 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14243 buf[sign+1] == 'o');
14244 numnondigits -= 2;
14245 buf += 2;
14246 len -= 2;
14247 if (sign)
14248 buf[0] = '-';
14249 assert(len == numnondigits + numdigits);
14250 assert(numdigits > 0);
14251 }
14252
14253 /* Fill with leading zeroes to meet minimum width. */
14254 if (prec > numdigits) {
14255 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14256 numnondigits + prec);
14257 char *b1;
14258 if (!r1) {
14259 Py_DECREF(result);
14260 return NULL;
14261 }
14262 b1 = PyBytes_AS_STRING(r1);
14263 for (i = 0; i < numnondigits; ++i)
14264 *b1++ = *buf++;
14265 for (i = 0; i < prec - numdigits; i++)
14266 *b1++ = '0';
14267 for (i = 0; i < numdigits; i++)
14268 *b1++ = *buf++;
14269 *b1 = '\0';
14270 Py_DECREF(result);
14271 result = r1;
14272 buf = PyBytes_AS_STRING(result);
14273 len = numnondigits + prec;
14274 }
14275
14276 /* Fix up case for hex conversions. */
14277 if (type == 'X') {
14278 /* Need to convert all lower case letters to upper case.
14279 and need to convert 0x to 0X (and -0x to -0X). */
14280 for (i = 0; i < len; i++)
14281 if (buf[i] >= 'a' && buf[i] <= 'x')
14282 buf[i] -= 'a'-'A';
14283 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014284 if (!PyUnicode_Check(result)
14285 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014286 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014287 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014288 Py_DECREF(result);
14289 result = unicode;
14290 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 else if (len != PyUnicode_GET_LENGTH(result)) {
14292 if (PyUnicode_Resize(&result, len) < 0)
14293 Py_CLEAR(result);
14294 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014295 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014296}
14297
Ethan Furmandf3ed242014-01-05 06:50:30 -080014298/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014299 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014300 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301 * -1 and raise an exception on error */
14302static int
Victor Stinnera47082312012-10-04 02:19:54 +020014303mainformatlong(PyObject *v,
14304 struct unicode_format_arg_t *arg,
14305 PyObject **p_output,
14306 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307{
14308 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014309 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014310
14311 if (!PyNumber_Check(v))
14312 goto wrongtype;
14313
Ethan Furman9ab74802014-03-21 06:38:46 -070014314 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014315 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014316 if (type == 'o' || type == 'x' || type == 'X') {
14317 iobj = PyNumber_Index(v);
14318 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014319 if (PyErr_ExceptionMatches(PyExc_TypeError))
14320 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014321 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014322 }
14323 }
14324 else {
14325 iobj = PyNumber_Long(v);
14326 if (iobj == NULL ) {
14327 if (PyErr_ExceptionMatches(PyExc_TypeError))
14328 goto wrongtype;
14329 return -1;
14330 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014331 }
14332 assert(PyLong_Check(iobj));
14333 }
14334 else {
14335 iobj = v;
14336 Py_INCREF(iobj);
14337 }
14338
14339 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014340 && arg->width == -1 && arg->prec == -1
14341 && !(arg->flags & (F_SIGN | F_BLANK))
14342 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 {
14344 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014345 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014346 int base;
14347
Victor Stinnera47082312012-10-04 02:19:54 +020014348 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014349 {
14350 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014351 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014352 case 'd':
14353 case 'i':
14354 case 'u':
14355 base = 10;
14356 break;
14357 case 'o':
14358 base = 8;
14359 break;
14360 case 'x':
14361 case 'X':
14362 base = 16;
14363 break;
14364 }
14365
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014366 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14367 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014368 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014369 }
14370 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014371 return 1;
14372 }
14373
Ethan Furmanb95b5612015-01-23 20:05:18 -080014374 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014375 Py_DECREF(iobj);
14376 if (res == NULL)
14377 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014378 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014379 return 0;
14380
14381wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014382 switch(type)
14383 {
14384 case 'o':
14385 case 'x':
14386 case 'X':
14387 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014388 "%%%c format: an integer is required, "
14389 "not %.200s",
14390 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014391 break;
14392 default:
14393 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014394 "%%%c format: a number is required, "
14395 "not %.200s",
14396 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014397 break;
14398 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014399 return -1;
14400}
14401
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014402static Py_UCS4
14403formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014404{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014405 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014406 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014407 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014408 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014409 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014410 goto onError;
14411 }
14412 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014413 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014414 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014415 /* make sure number is a type of integer */
14416 if (!PyLong_Check(v)) {
14417 iobj = PyNumber_Index(v);
14418 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014419 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014420 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014421 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014422 Py_DECREF(iobj);
14423 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014424 else {
14425 x = PyLong_AsLong(v);
14426 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014427 if (x == -1 && PyErr_Occurred())
14428 goto onError;
14429
Victor Stinner8faf8212011-12-08 22:14:11 +010014430 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014431 PyErr_SetString(PyExc_OverflowError,
14432 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014433 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014434 }
14435
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014436 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014438
Benjamin Peterson29060642009-01-31 22:14:21 +000014439 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014440 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014441 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014442 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014443}
14444
Victor Stinnera47082312012-10-04 02:19:54 +020014445/* Parse options of an argument: flags, width, precision.
14446 Handle also "%(name)" syntax.
14447
14448 Return 0 if the argument has been formatted into arg->str.
14449 Return 1 if the argument has been written into ctx->writer,
14450 Raise an exception and return -1 on error. */
14451static int
14452unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14453 struct unicode_format_arg_t *arg)
14454{
14455#define FORMAT_READ(ctx) \
14456 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14457
14458 PyObject *v;
14459
Victor Stinnera47082312012-10-04 02:19:54 +020014460 if (arg->ch == '(') {
14461 /* Get argument value from a dictionary. Example: "%(name)s". */
14462 Py_ssize_t keystart;
14463 Py_ssize_t keylen;
14464 PyObject *key;
14465 int pcount = 1;
14466
14467 if (ctx->dict == NULL) {
14468 PyErr_SetString(PyExc_TypeError,
14469 "format requires a mapping");
14470 return -1;
14471 }
14472 ++ctx->fmtpos;
14473 --ctx->fmtcnt;
14474 keystart = ctx->fmtpos;
14475 /* Skip over balanced parentheses */
14476 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14477 arg->ch = FORMAT_READ(ctx);
14478 if (arg->ch == ')')
14479 --pcount;
14480 else if (arg->ch == '(')
14481 ++pcount;
14482 ctx->fmtpos++;
14483 }
14484 keylen = ctx->fmtpos - keystart - 1;
14485 if (ctx->fmtcnt < 0 || pcount > 0) {
14486 PyErr_SetString(PyExc_ValueError,
14487 "incomplete format key");
14488 return -1;
14489 }
14490 key = PyUnicode_Substring(ctx->fmtstr,
14491 keystart, keystart + keylen);
14492 if (key == NULL)
14493 return -1;
14494 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014495 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014496 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014497 }
14498 ctx->args = PyObject_GetItem(ctx->dict, key);
14499 Py_DECREF(key);
14500 if (ctx->args == NULL)
14501 return -1;
14502 ctx->args_owned = 1;
14503 ctx->arglen = -1;
14504 ctx->argidx = -2;
14505 }
14506
14507 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014508 while (--ctx->fmtcnt >= 0) {
14509 arg->ch = FORMAT_READ(ctx);
14510 ctx->fmtpos++;
14511 switch (arg->ch) {
14512 case '-': arg->flags |= F_LJUST; continue;
14513 case '+': arg->flags |= F_SIGN; continue;
14514 case ' ': arg->flags |= F_BLANK; continue;
14515 case '#': arg->flags |= F_ALT; continue;
14516 case '0': arg->flags |= F_ZERO; continue;
14517 }
14518 break;
14519 }
14520
14521 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014522 if (arg->ch == '*') {
14523 v = unicode_format_getnextarg(ctx);
14524 if (v == NULL)
14525 return -1;
14526 if (!PyLong_Check(v)) {
14527 PyErr_SetString(PyExc_TypeError,
14528 "* wants int");
14529 return -1;
14530 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014531 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014532 if (arg->width == -1 && PyErr_Occurred())
14533 return -1;
14534 if (arg->width < 0) {
14535 arg->flags |= F_LJUST;
14536 arg->width = -arg->width;
14537 }
14538 if (--ctx->fmtcnt >= 0) {
14539 arg->ch = FORMAT_READ(ctx);
14540 ctx->fmtpos++;
14541 }
14542 }
14543 else if (arg->ch >= '0' && arg->ch <= '9') {
14544 arg->width = arg->ch - '0';
14545 while (--ctx->fmtcnt >= 0) {
14546 arg->ch = FORMAT_READ(ctx);
14547 ctx->fmtpos++;
14548 if (arg->ch < '0' || arg->ch > '9')
14549 break;
14550 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14551 mixing signed and unsigned comparison. Since arg->ch is between
14552 '0' and '9', casting to int is safe. */
14553 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14554 PyErr_SetString(PyExc_ValueError,
14555 "width too big");
14556 return -1;
14557 }
14558 arg->width = arg->width*10 + (arg->ch - '0');
14559 }
14560 }
14561
14562 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014563 if (arg->ch == '.') {
14564 arg->prec = 0;
14565 if (--ctx->fmtcnt >= 0) {
14566 arg->ch = FORMAT_READ(ctx);
14567 ctx->fmtpos++;
14568 }
14569 if (arg->ch == '*') {
14570 v = unicode_format_getnextarg(ctx);
14571 if (v == NULL)
14572 return -1;
14573 if (!PyLong_Check(v)) {
14574 PyErr_SetString(PyExc_TypeError,
14575 "* wants int");
14576 return -1;
14577 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014578 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014579 if (arg->prec == -1 && PyErr_Occurred())
14580 return -1;
14581 if (arg->prec < 0)
14582 arg->prec = 0;
14583 if (--ctx->fmtcnt >= 0) {
14584 arg->ch = FORMAT_READ(ctx);
14585 ctx->fmtpos++;
14586 }
14587 }
14588 else if (arg->ch >= '0' && arg->ch <= '9') {
14589 arg->prec = arg->ch - '0';
14590 while (--ctx->fmtcnt >= 0) {
14591 arg->ch = FORMAT_READ(ctx);
14592 ctx->fmtpos++;
14593 if (arg->ch < '0' || arg->ch > '9')
14594 break;
14595 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14596 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014597 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014598 return -1;
14599 }
14600 arg->prec = arg->prec*10 + (arg->ch - '0');
14601 }
14602 }
14603 }
14604
14605 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14606 if (ctx->fmtcnt >= 0) {
14607 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14608 if (--ctx->fmtcnt >= 0) {
14609 arg->ch = FORMAT_READ(ctx);
14610 ctx->fmtpos++;
14611 }
14612 }
14613 }
14614 if (ctx->fmtcnt < 0) {
14615 PyErr_SetString(PyExc_ValueError,
14616 "incomplete format");
14617 return -1;
14618 }
14619 return 0;
14620
14621#undef FORMAT_READ
14622}
14623
14624/* Format one argument. Supported conversion specifiers:
14625
14626 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014627 - "i", "d", "u": int or float
14628 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014629 - "e", "E", "f", "F", "g", "G": float
14630 - "c": int or str (1 character)
14631
Victor Stinner8dbd4212012-12-04 09:30:24 +010014632 When possible, the output is written directly into the Unicode writer
14633 (ctx->writer). A string is created when padding is required.
14634
Victor Stinnera47082312012-10-04 02:19:54 +020014635 Return 0 if the argument has been formatted into *p_str,
14636 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014637 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014638static int
14639unicode_format_arg_format(struct unicode_formatter_t *ctx,
14640 struct unicode_format_arg_t *arg,
14641 PyObject **p_str)
14642{
14643 PyObject *v;
14644 _PyUnicodeWriter *writer = &ctx->writer;
14645
14646 if (ctx->fmtcnt == 0)
14647 ctx->writer.overallocate = 0;
14648
Victor Stinnera47082312012-10-04 02:19:54 +020014649 v = unicode_format_getnextarg(ctx);
14650 if (v == NULL)
14651 return -1;
14652
Victor Stinnera47082312012-10-04 02:19:54 +020014653
14654 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014655 case 's':
14656 case 'r':
14657 case 'a':
14658 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14659 /* Fast path */
14660 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14661 return -1;
14662 return 1;
14663 }
14664
14665 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14666 *p_str = v;
14667 Py_INCREF(*p_str);
14668 }
14669 else {
14670 if (arg->ch == 's')
14671 *p_str = PyObject_Str(v);
14672 else if (arg->ch == 'r')
14673 *p_str = PyObject_Repr(v);
14674 else
14675 *p_str = PyObject_ASCII(v);
14676 }
14677 break;
14678
14679 case 'i':
14680 case 'd':
14681 case 'u':
14682 case 'o':
14683 case 'x':
14684 case 'X':
14685 {
14686 int ret = mainformatlong(v, arg, p_str, writer);
14687 if (ret != 0)
14688 return ret;
14689 arg->sign = 1;
14690 break;
14691 }
14692
14693 case 'e':
14694 case 'E':
14695 case 'f':
14696 case 'F':
14697 case 'g':
14698 case 'G':
14699 if (arg->width == -1 && arg->prec == -1
14700 && !(arg->flags & (F_SIGN | F_BLANK)))
14701 {
14702 /* Fast path */
14703 if (formatfloat(v, arg, NULL, writer) == -1)
14704 return -1;
14705 return 1;
14706 }
14707
14708 arg->sign = 1;
14709 if (formatfloat(v, arg, p_str, NULL) == -1)
14710 return -1;
14711 break;
14712
14713 case 'c':
14714 {
14715 Py_UCS4 ch = formatchar(v);
14716 if (ch == (Py_UCS4) -1)
14717 return -1;
14718 if (arg->width == -1 && arg->prec == -1) {
14719 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014720 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014721 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014722 return 1;
14723 }
14724 *p_str = PyUnicode_FromOrdinal(ch);
14725 break;
14726 }
14727
14728 default:
14729 PyErr_Format(PyExc_ValueError,
14730 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014731 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014732 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14733 (int)arg->ch,
14734 ctx->fmtpos - 1);
14735 return -1;
14736 }
14737 if (*p_str == NULL)
14738 return -1;
14739 assert (PyUnicode_Check(*p_str));
14740 return 0;
14741}
14742
14743static int
14744unicode_format_arg_output(struct unicode_formatter_t *ctx,
14745 struct unicode_format_arg_t *arg,
14746 PyObject *str)
14747{
14748 Py_ssize_t len;
14749 enum PyUnicode_Kind kind;
14750 void *pbuf;
14751 Py_ssize_t pindex;
14752 Py_UCS4 signchar;
14753 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014754 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014755 Py_ssize_t sublen;
14756 _PyUnicodeWriter *writer = &ctx->writer;
14757 Py_UCS4 fill;
14758
14759 fill = ' ';
14760 if (arg->sign && arg->flags & F_ZERO)
14761 fill = '0';
14762
14763 if (PyUnicode_READY(str) == -1)
14764 return -1;
14765
14766 len = PyUnicode_GET_LENGTH(str);
14767 if ((arg->width == -1 || arg->width <= len)
14768 && (arg->prec == -1 || arg->prec >= len)
14769 && !(arg->flags & (F_SIGN | F_BLANK)))
14770 {
14771 /* Fast path */
14772 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14773 return -1;
14774 return 0;
14775 }
14776
14777 /* Truncate the string for "s", "r" and "a" formats
14778 if the precision is set */
14779 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14780 if (arg->prec >= 0 && len > arg->prec)
14781 len = arg->prec;
14782 }
14783
14784 /* Adjust sign and width */
14785 kind = PyUnicode_KIND(str);
14786 pbuf = PyUnicode_DATA(str);
14787 pindex = 0;
14788 signchar = '\0';
14789 if (arg->sign) {
14790 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14791 if (ch == '-' || ch == '+') {
14792 signchar = ch;
14793 len--;
14794 pindex++;
14795 }
14796 else if (arg->flags & F_SIGN)
14797 signchar = '+';
14798 else if (arg->flags & F_BLANK)
14799 signchar = ' ';
14800 else
14801 arg->sign = 0;
14802 }
14803 if (arg->width < len)
14804 arg->width = len;
14805
14806 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014807 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014808 if (!(arg->flags & F_LJUST)) {
14809 if (arg->sign) {
14810 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014811 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014812 }
14813 else {
14814 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014815 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014816 }
14817 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014818 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14819 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014820 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014821 }
14822
Victor Stinnera47082312012-10-04 02:19:54 +020014823 buflen = arg->width;
14824 if (arg->sign && len == arg->width)
14825 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014826 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014827 return -1;
14828
14829 /* Write the sign if needed */
14830 if (arg->sign) {
14831 if (fill != ' ') {
14832 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14833 writer->pos += 1;
14834 }
14835 if (arg->width > len)
14836 arg->width--;
14837 }
14838
14839 /* Write the numeric prefix for "x", "X" and "o" formats
14840 if the alternate form is used.
14841 For example, write "0x" for the "%#x" format. */
14842 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14843 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14844 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14845 if (fill != ' ') {
14846 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14847 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14848 writer->pos += 2;
14849 pindex += 2;
14850 }
14851 arg->width -= 2;
14852 if (arg->width < 0)
14853 arg->width = 0;
14854 len -= 2;
14855 }
14856
14857 /* Pad left with the fill character if needed */
14858 if (arg->width > len && !(arg->flags & F_LJUST)) {
14859 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014860 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014861 writer->pos += sublen;
14862 arg->width = len;
14863 }
14864
14865 /* If padding with spaces: write sign if needed and/or numeric prefix if
14866 the alternate form is used */
14867 if (fill == ' ') {
14868 if (arg->sign) {
14869 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14870 writer->pos += 1;
14871 }
14872 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14873 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14874 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14875 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14876 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14877 writer->pos += 2;
14878 pindex += 2;
14879 }
14880 }
14881
14882 /* Write characters */
14883 if (len) {
14884 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14885 str, pindex, len);
14886 writer->pos += len;
14887 }
14888
14889 /* Pad right with the fill character if needed */
14890 if (arg->width > len) {
14891 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014892 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014893 writer->pos += sublen;
14894 }
14895 return 0;
14896}
14897
14898/* Helper of PyUnicode_Format(): format one arg.
14899 Return 0 on success, raise an exception and return -1 on error. */
14900static int
14901unicode_format_arg(struct unicode_formatter_t *ctx)
14902{
14903 struct unicode_format_arg_t arg;
14904 PyObject *str;
14905 int ret;
14906
Victor Stinner8dbd4212012-12-04 09:30:24 +010014907 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014908 if (arg.ch == '%') {
14909 ctx->fmtpos++;
14910 ctx->fmtcnt--;
14911 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14912 return -1;
14913 return 0;
14914 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014915 arg.flags = 0;
14916 arg.width = -1;
14917 arg.prec = -1;
14918 arg.sign = 0;
14919 str = NULL;
14920
Victor Stinnera47082312012-10-04 02:19:54 +020014921 ret = unicode_format_arg_parse(ctx, &arg);
14922 if (ret == -1)
14923 return -1;
14924
14925 ret = unicode_format_arg_format(ctx, &arg, &str);
14926 if (ret == -1)
14927 return -1;
14928
14929 if (ret != 1) {
14930 ret = unicode_format_arg_output(ctx, &arg, str);
14931 Py_DECREF(str);
14932 if (ret == -1)
14933 return -1;
14934 }
14935
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014936 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014937 PyErr_SetString(PyExc_TypeError,
14938 "not all arguments converted during string formatting");
14939 return -1;
14940 }
14941 return 0;
14942}
14943
Alexander Belopolsky40018472011-02-26 01:02:56 +000014944PyObject *
14945PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946{
Victor Stinnera47082312012-10-04 02:19:54 +020014947 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014948
Guido van Rossumd57fd912000-03-10 22:53:23 +000014949 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014950 PyErr_BadInternalCall();
14951 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014952 }
Victor Stinnera47082312012-10-04 02:19:54 +020014953
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014954 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014955 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014956
14957 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014958 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14959 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14960 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14961 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014962
Victor Stinner8f674cc2013-04-17 23:02:17 +020014963 _PyUnicodeWriter_Init(&ctx.writer);
14964 ctx.writer.min_length = ctx.fmtcnt + 100;
14965 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014966
Guido van Rossumd57fd912000-03-10 22:53:23 +000014967 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014968 ctx.arglen = PyTuple_Size(args);
14969 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014970 }
14971 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014972 ctx.arglen = -1;
14973 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014974 }
Victor Stinnera47082312012-10-04 02:19:54 +020014975 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014976 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014977 ctx.dict = args;
14978 else
14979 ctx.dict = NULL;
14980 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014981
Victor Stinnera47082312012-10-04 02:19:54 +020014982 while (--ctx.fmtcnt >= 0) {
14983 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014984 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014985
14986 nonfmtpos = ctx.fmtpos++;
14987 while (ctx.fmtcnt >= 0 &&
14988 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14989 ctx.fmtpos++;
14990 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014991 }
Victor Stinnera47082312012-10-04 02:19:54 +020014992 if (ctx.fmtcnt < 0) {
14993 ctx.fmtpos--;
14994 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014995 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014996
Victor Stinnercfc4c132013-04-03 01:48:39 +020014997 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14998 nonfmtpos, ctx.fmtpos) < 0)
14999 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015000 }
15001 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015002 ctx.fmtpos++;
15003 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015004 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015005 }
15006 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015007
Victor Stinnera47082312012-10-04 02:19:54 +020015008 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015009 PyErr_SetString(PyExc_TypeError,
15010 "not all arguments converted during string formatting");
15011 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015012 }
15013
Victor Stinnera47082312012-10-04 02:19:54 +020015014 if (ctx.args_owned) {
15015 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015016 }
Victor Stinnera47082312012-10-04 02:19:54 +020015017 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015018
Benjamin Peterson29060642009-01-31 22:14:21 +000015019 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015020 _PyUnicodeWriter_Dealloc(&ctx.writer);
15021 if (ctx.args_owned) {
15022 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015023 }
15024 return NULL;
15025}
15026
Jeremy Hylton938ace62002-07-17 16:30:39 +000015027static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015028unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15029
Tim Peters6d6c1a32001-08-02 04:15:00 +000015030static PyObject *
15031unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15032{
Benjamin Peterson29060642009-01-31 22:14:21 +000015033 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 static char *kwlist[] = {"object", "encoding", "errors", 0};
15035 char *encoding = NULL;
15036 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015037
Benjamin Peterson14339b62009-01-31 16:36:08 +000015038 if (type != &PyUnicode_Type)
15039 return unicode_subtype_new(type, args, kwds);
15040 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015041 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015042 return NULL;
15043 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015044 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015045 if (encoding == NULL && errors == NULL)
15046 return PyObject_Str(x);
15047 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015048 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015049}
15050
Guido van Rossume023fe02001-08-30 03:12:59 +000015051static PyObject *
15052unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15053{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015054 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 Py_ssize_t length, char_size;
15056 int share_wstr, share_utf8;
15057 unsigned int kind;
15058 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015059
Benjamin Peterson14339b62009-01-31 16:36:08 +000015060 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015061
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015062 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015063 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015065 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015066 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015067 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015069 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015071 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015072 if (self == NULL) {
15073 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015074 return NULL;
15075 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015076 kind = PyUnicode_KIND(unicode);
15077 length = PyUnicode_GET_LENGTH(unicode);
15078
15079 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015080#ifdef Py_DEBUG
15081 _PyUnicode_HASH(self) = -1;
15082#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015083 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015084#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015085 _PyUnicode_STATE(self).interned = 0;
15086 _PyUnicode_STATE(self).kind = kind;
15087 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015088 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015089 _PyUnicode_STATE(self).ready = 1;
15090 _PyUnicode_WSTR(self) = NULL;
15091 _PyUnicode_UTF8_LENGTH(self) = 0;
15092 _PyUnicode_UTF8(self) = NULL;
15093 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015094 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015095
15096 share_utf8 = 0;
15097 share_wstr = 0;
15098 if (kind == PyUnicode_1BYTE_KIND) {
15099 char_size = 1;
15100 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15101 share_utf8 = 1;
15102 }
15103 else if (kind == PyUnicode_2BYTE_KIND) {
15104 char_size = 2;
15105 if (sizeof(wchar_t) == 2)
15106 share_wstr = 1;
15107 }
15108 else {
15109 assert(kind == PyUnicode_4BYTE_KIND);
15110 char_size = 4;
15111 if (sizeof(wchar_t) == 4)
15112 share_wstr = 1;
15113 }
15114
15115 /* Ensure we won't overflow the length. */
15116 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15117 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015118 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015119 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015120 data = PyObject_MALLOC((length + 1) * char_size);
15121 if (data == NULL) {
15122 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015123 goto onError;
15124 }
15125
Victor Stinnerc3c74152011-10-02 20:39:55 +020015126 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015127 if (share_utf8) {
15128 _PyUnicode_UTF8_LENGTH(self) = length;
15129 _PyUnicode_UTF8(self) = data;
15130 }
15131 if (share_wstr) {
15132 _PyUnicode_WSTR_LENGTH(self) = length;
15133 _PyUnicode_WSTR(self) = (wchar_t *)data;
15134 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015135
Christian Heimesf051e432016-09-13 20:22:02 +020015136 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015137 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015138 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015139#ifdef Py_DEBUG
15140 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15141#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015142 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015143 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015144
15145onError:
15146 Py_DECREF(unicode);
15147 Py_DECREF(self);
15148 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015149}
15150
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015151PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015152"str(object='') -> str\n\
15153str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015154\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015155Create a new string object from the given object. If encoding or\n\
15156errors is specified, then the object must expose a data buffer\n\
15157that will be decoded using the given encoding and error handler.\n\
15158Otherwise, returns the result of object.__str__() (if defined)\n\
15159or repr(object).\n\
15160encoding defaults to sys.getdefaultencoding().\n\
15161errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015162
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015163static PyObject *unicode_iter(PyObject *seq);
15164
Guido van Rossumd57fd912000-03-10 22:53:23 +000015165PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015166 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015167 "str", /* tp_name */
15168 sizeof(PyUnicodeObject), /* tp_basicsize */
15169 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015170 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015171 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015172 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015173 0, /* tp_getattr */
15174 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015175 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015176 unicode_repr, /* tp_repr */
15177 &unicode_as_number, /* tp_as_number */
15178 &unicode_as_sequence, /* tp_as_sequence */
15179 &unicode_as_mapping, /* tp_as_mapping */
15180 (hashfunc) unicode_hash, /* tp_hash*/
15181 0, /* tp_call*/
15182 (reprfunc) unicode_str, /* tp_str */
15183 PyObject_GenericGetAttr, /* tp_getattro */
15184 0, /* tp_setattro */
15185 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015186 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015187 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15188 unicode_doc, /* tp_doc */
15189 0, /* tp_traverse */
15190 0, /* tp_clear */
15191 PyUnicode_RichCompare, /* tp_richcompare */
15192 0, /* tp_weaklistoffset */
15193 unicode_iter, /* tp_iter */
15194 0, /* tp_iternext */
15195 unicode_methods, /* tp_methods */
15196 0, /* tp_members */
15197 0, /* tp_getset */
15198 &PyBaseObject_Type, /* tp_base */
15199 0, /* tp_dict */
15200 0, /* tp_descr_get */
15201 0, /* tp_descr_set */
15202 0, /* tp_dictoffset */
15203 0, /* tp_init */
15204 0, /* tp_alloc */
15205 unicode_new, /* tp_new */
15206 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015207};
15208
15209/* Initialize the Unicode implementation */
15210
Victor Stinner331a6a52019-05-27 16:39:22 +020015211PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015212_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015213{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015214 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015215 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015216 0x000A, /* LINE FEED */
15217 0x000D, /* CARRIAGE RETURN */
15218 0x001C, /* FILE SEPARATOR */
15219 0x001D, /* GROUP SEPARATOR */
15220 0x001E, /* RECORD SEPARATOR */
15221 0x0085, /* NEXT LINE */
15222 0x2028, /* LINE SEPARATOR */
15223 0x2029, /* PARAGRAPH SEPARATOR */
15224 };
15225
Fred Drakee4315f52000-05-09 19:53:39 +000015226 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015227 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015228 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015229 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015230 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015231 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015232
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015233 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015234 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015235 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015236
15237 /* initialize the linebreak bloom filter */
15238 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015239 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015240 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015241
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015242 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015243 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015244 }
15245 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015246 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015247 }
15248 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015249 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015250 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015251 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015252}
15253
15254/* Finalize the Unicode implementation */
15255
/* Does nothing and always reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;
    return freed;
}
15261
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015262
Walter Dörwald16807132007-05-25 13:52:07 +000015263void
15264PyUnicode_InternInPlace(PyObject **p)
15265{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015266 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015268#ifdef Py_DEBUG
15269 assert(s != NULL);
15270 assert(_PyUnicode_CHECK(s));
15271#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015272 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015273 return;
15274#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015275 /* If it's a subclass, we don't really know what putting
15276 it in the interned dict might do. */
15277 if (!PyUnicode_CheckExact(s))
15278 return;
15279 if (PyUnicode_CHECK_INTERNED(s))
15280 return;
15281 if (interned == NULL) {
15282 interned = PyDict_New();
15283 if (interned == NULL) {
15284 PyErr_Clear(); /* Don't leave an exception */
15285 return;
15286 }
15287 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015288 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015289 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015291 if (t == NULL) {
15292 PyErr_Clear();
15293 return;
15294 }
15295 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015296 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015297 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015298 return;
15299 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015300 /* The two references in interned are not counted by refcnt.
15301 The deallocator will take care of this */
15302 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015303 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015304}
15305
15306void
15307PyUnicode_InternImmortal(PyObject **p)
15308{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 PyUnicode_InternInPlace(p);
15310 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015311 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015312 Py_INCREF(*p);
15313 }
Walter Dörwald16807132007-05-25 13:52:07 +000015314}
15315
15316PyObject *
15317PyUnicode_InternFromString(const char *cp)
15318{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015319 PyObject *s = PyUnicode_FromString(cp);
15320 if (s == NULL)
15321 return NULL;
15322 PyUnicode_InternInPlace(&s);
15323 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015324}
15325
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015326
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every key of the `interned` dict, then clear and
   drop the dict.  Only compiled for Valgrind / Insure++ builds. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    Py_ssize_t idx, count;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015386
15387
15388/********************* Unicode Iterator **************************/
15389
15390typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 PyObject_HEAD
15392 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015393 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015394} unicodeiterobject;
15395
15396static void
15397unicodeiter_dealloc(unicodeiterobject *it)
15398{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 _PyObject_GC_UNTRACK(it);
15400 Py_XDECREF(it->it_seq);
15401 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015402}
15403
15404static int
15405unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15406{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015407 Py_VISIT(it->it_seq);
15408 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409}
15410
15411static PyObject *
15412unicodeiter_next(unicodeiterobject *it)
15413{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015414 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015415
Benjamin Peterson14339b62009-01-31 16:36:08 +000015416 assert(it != NULL);
15417 seq = it->it_seq;
15418 if (seq == NULL)
15419 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015420 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015421
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015422 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15423 int kind = PyUnicode_KIND(seq);
15424 void *data = PyUnicode_DATA(seq);
15425 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15426 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015427 if (item != NULL)
15428 ++it->it_index;
15429 return item;
15430 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015431
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015433 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015435}
15436
15437static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015438unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015439{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015440 Py_ssize_t len = 0;
15441 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015442 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015444}
15445
15446PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15447
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015448static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015449unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015450{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015451 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015452 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015453 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015454 it->it_seq, it->it_index);
15455 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015456 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015457 if (u == NULL)
15458 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015459 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015460 }
15461}
15462
15463PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15464
15465static PyObject *
15466unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15467{
15468 Py_ssize_t index = PyLong_AsSsize_t(state);
15469 if (index == -1 && PyErr_Occurred())
15470 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015471 if (it->it_seq != NULL) {
15472 if (index < 0)
15473 index = 0;
15474 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15475 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15476 it->it_index = index;
15477 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015478 Py_RETURN_NONE;
15479}
15480
15481PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15482
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015483static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015484 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015485 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015486 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15487 reduce_doc},
15488 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15489 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015490 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015491};
15492
15493PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15495 "str_iterator", /* tp_name */
15496 sizeof(unicodeiterobject), /* tp_basicsize */
15497 0, /* tp_itemsize */
15498 /* methods */
15499 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015500 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015501 0, /* tp_getattr */
15502 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015503 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015504 0, /* tp_repr */
15505 0, /* tp_as_number */
15506 0, /* tp_as_sequence */
15507 0, /* tp_as_mapping */
15508 0, /* tp_hash */
15509 0, /* tp_call */
15510 0, /* tp_str */
15511 PyObject_GenericGetAttr, /* tp_getattro */
15512 0, /* tp_setattro */
15513 0, /* tp_as_buffer */
15514 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15515 0, /* tp_doc */
15516 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15517 0, /* tp_clear */
15518 0, /* tp_richcompare */
15519 0, /* tp_weaklistoffset */
15520 PyObject_SelfIter, /* tp_iter */
15521 (iternextfunc)unicodeiter_next, /* tp_iternext */
15522 unicodeiter_methods, /* tp_methods */
15523 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015524};
15525
15526static PyObject *
15527unicode_iter(PyObject *seq)
15528{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015529 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015530
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 if (!PyUnicode_Check(seq)) {
15532 PyErr_BadInternalCall();
15533 return NULL;
15534 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015535 if (PyUnicode_READY(seq) == -1)
15536 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015537 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15538 if (it == NULL)
15539 return NULL;
15540 it->it_index = 0;
15541 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015542 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015543 _PyObject_GC_TRACK(it);
15544 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015545}
15546
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015547
15548size_t
15549Py_UNICODE_strlen(const Py_UNICODE *u)
15550{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015551 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015552}
15553
15554Py_UNICODE*
15555Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15556{
15557 Py_UNICODE *u = s1;
15558 while ((*u++ = *s2++));
15559 return s1;
15560}
15561
15562Py_UNICODE*
15563Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15564{
15565 Py_UNICODE *u = s1;
15566 while ((*u++ = *s2++))
15567 if (n-- == 0)
15568 break;
15569 return s1;
15570}
15571
15572Py_UNICODE*
15573Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15574{
15575 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015576 u1 += wcslen(u1);
15577 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015578 return s1;
15579}
15580
15581int
15582Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15583{
15584 while (*s1 && *s2 && *s1 == *s2)
15585 s1++, s2++;
15586 if (*s1 && *s2)
15587 return (*s1 < *s2) ? -1 : +1;
15588 if (*s1)
15589 return 1;
15590 if (*s2)
15591 return -1;
15592 return 0;
15593}
15594
15595int
15596Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15597{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015598 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015599 for (; n != 0; n--) {
15600 u1 = *s1;
15601 u2 = *s2;
15602 if (u1 != u2)
15603 return (u1 < u2) ? -1 : +1;
15604 if (u1 == '\0')
15605 return 0;
15606 s1++;
15607 s2++;
15608 }
15609 return 0;
15610}
15611
15612Py_UNICODE*
15613Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15614{
15615 const Py_UNICODE *p;
15616 for (p = s; *p; p++)
15617 if (*p == c)
15618 return (Py_UNICODE*)p;
15619 return NULL;
15620}
15621
15622Py_UNICODE*
15623Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15624{
15625 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015626 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015627 while (p != s) {
15628 p--;
15629 if (*p == c)
15630 return (Py_UNICODE*)p;
15631 }
15632 return NULL;
15633}
Victor Stinner331ea922010-08-10 16:37:20 +000015634
Victor Stinner71133ff2010-09-01 23:43:53 +000015635Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015636PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015637{
Victor Stinner577db2c2011-10-11 22:12:48 +020015638 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015639 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015640
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015641 if (!PyUnicode_Check(unicode)) {
15642 PyErr_BadArgument();
15643 return NULL;
15644 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015645 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015646 if (u == NULL)
15647 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015648 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015649 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015650 PyErr_NoMemory();
15651 return NULL;
15652 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015653 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015654 size *= sizeof(Py_UNICODE);
15655 copy = PyMem_Malloc(size);
15656 if (copy == NULL) {
15657 PyErr_NoMemory();
15658 return NULL;
15659 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015660 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015661 return copy;
15662}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015663
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015664
Victor Stinner709d23d2019-05-02 14:56:30 -040015665static int
15666encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015667{
Victor Stinner709d23d2019-05-02 14:56:30 -040015668 int res;
15669 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15670 if (res == -2) {
15671 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15672 return -1;
15673 }
15674 if (res < 0) {
15675 PyErr_NoMemory();
15676 return -1;
15677 }
15678 return 0;
15679}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015680
Victor Stinner709d23d2019-05-02 14:56:30 -040015681
15682static int
15683config_get_codec_name(wchar_t **config_encoding)
15684{
15685 char *encoding;
15686 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15687 return -1;
15688 }
15689
15690 PyObject *name_obj = NULL;
15691 PyObject *codec = _PyCodec_Lookup(encoding);
15692 PyMem_RawFree(encoding);
15693
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015694 if (!codec)
15695 goto error;
15696
15697 name_obj = PyObject_GetAttrString(codec, "name");
15698 Py_CLEAR(codec);
15699 if (!name_obj) {
15700 goto error;
15701 }
15702
Victor Stinner709d23d2019-05-02 14:56:30 -040015703 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15704 Py_DECREF(name_obj);
15705 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015706 goto error;
15707 }
15708
Victor Stinner709d23d2019-05-02 14:56:30 -040015709 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15710 if (raw_wname == NULL) {
15711 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015712 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015713 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015714 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015715
15716 PyMem_RawFree(*config_encoding);
15717 *config_encoding = raw_wname;
15718
15719 PyMem_Free(wname);
15720 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015721
15722error:
15723 Py_XDECREF(codec);
15724 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015725 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015726}
15727
15728
Victor Stinner331a6a52019-05-27 16:39:22 +020015729static PyStatus
Victor Stinnerc5c64252019-09-23 15:59:00 +020015730init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015731{
Victor Stinner709d23d2019-05-02 14:56:30 -040015732 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerc5c64252019-09-23 15:59:00 +020015733 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015734 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015735 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerc5c64252019-09-23 15:59:00 +020015736 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015737 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015738 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015739}
15740
15741
Victor Stinner709d23d2019-05-02 14:56:30 -040015742static int
15743init_fs_codec(PyInterpreterState *interp)
15744{
Victor Stinner331a6a52019-05-27 16:39:22 +020015745 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015746
15747 _Py_error_handler error_handler;
15748 error_handler = get_error_handler_wide(config->filesystem_errors);
15749 if (error_handler == _Py_ERROR_UNKNOWN) {
15750 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15751 return -1;
15752 }
15753
15754 char *encoding, *errors;
15755 if (encode_wstr_utf8(config->filesystem_encoding,
15756 &encoding,
15757 "filesystem_encoding") < 0) {
15758 return -1;
15759 }
15760
15761 if (encode_wstr_utf8(config->filesystem_errors,
15762 &errors,
15763 "filesystem_errors") < 0) {
15764 PyMem_RawFree(encoding);
15765 return -1;
15766 }
15767
15768 PyMem_RawFree(interp->fs_codec.encoding);
15769 interp->fs_codec.encoding = encoding;
15770 PyMem_RawFree(interp->fs_codec.errors);
15771 interp->fs_codec.errors = errors;
15772 interp->fs_codec.error_handler = error_handler;
15773
15774 /* At this point, PyUnicode_EncodeFSDefault() and
15775 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15776 the C implementation of the filesystem encoding. */
15777
15778 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15779 global configuration variables. */
15780 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15781 interp->fs_codec.errors) < 0) {
15782 PyErr_NoMemory();
15783 return -1;
15784 }
15785 return 0;
15786}
15787
15788
Victor Stinner331a6a52019-05-27 16:39:22 +020015789static PyStatus
Victor Stinnerc5c64252019-09-23 15:59:00 +020015790init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015791{
Victor Stinnerc5c64252019-09-23 15:59:00 +020015792 PyInterpreterState *interp = tstate->interp;
15793
Victor Stinner709d23d2019-05-02 14:56:30 -040015794 /* Update the filesystem encoding to the normalized Python codec name.
15795 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15796 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015797 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015798 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerc5c64252019-09-23 15:59:00 +020015799 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015800 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerc5c64252019-09-23 15:59:00 +020015801 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015802 }
15803
Victor Stinner709d23d2019-05-02 14:56:30 -040015804 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015805 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015806 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015807 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808}
15809
15810
Victor Stinner331a6a52019-05-27 16:39:22 +020015811PyStatus
Victor Stinnerc5c64252019-09-23 15:59:00 +020015812_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015813{
Victor Stinnerc5c64252019-09-23 15:59:00 +020015814 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015815 if (_PyStatus_EXCEPTION(status)) {
15816 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015817 }
15818
Victor Stinnerc5c64252019-09-23 15:59:00 +020015819 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015820}
15821
15822
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter's config to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec() on success. On allocation failure,
   sets MemoryError and returns -1 without touching the config. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacement strings up front so that the config is
       only modified once both are known to be available. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        /* The config takes ownership of the new strings. */
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever one succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15848
15849
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015850void
15851_PyUnicode_Fini(void)
15852{
15853#if defined(WITH_VALGRIND) || defined(__INSURE__)
15854 /* Insure++ is a memory analysis tool that aids in discovering
15855 * memory leaks and other memory problems. On Python exit, the
15856 * interned string dictionaries are flagged as being in use at exit
15857 * (which it is). Under normal circumstances, this is fine because
15858 * the memory will be automatically reclaimed by the system. Under
15859 * memory debugging, it's a huge source of useless noise, so we
15860 * trade off slower shutdown for less distraction in the memory
15861 * reports. -baw
15862 */
15863 unicode_release_interned();
15864#endif /* __INSURE__ */
15865
15866 Py_CLEAR(unicode_empty);
15867
15868 for (Py_ssize_t i = 0; i < 256; i++) {
15869 Py_CLEAR(unicode_latin1[i]);
15870 }
15871 _PyUnicode_ClearStaticStrings();
15872 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015873
15874 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15875 PyMem_RawFree(interp->fs_codec.encoding);
15876 interp->fs_codec.encoding = NULL;
15877 PyMem_RawFree(interp->fs_codec.errors);
15878 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015879}
15880
15881
Georg Brandl66c221e2010-10-14 07:04:07 +000015882/* A _string module, to export formatter_parser and formatter_field_name_split
15883 to the string.Formatter class implemented in Python. */
15884
15885static PyMethodDef _string_methods[] = {
15886 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15887 METH_O, PyDoc_STR("split the argument as a field name")},
15888 {"formatter_parser", (PyCFunction) formatter_parser,
15889 METH_O, PyDoc_STR("parse the argument as a format string")},
15890 {NULL, NULL}
15891};
15892
15893static struct PyModuleDef _string_module = {
15894 PyModuleDef_HEAD_INIT,
15895 "_string",
15896 PyDoc_STR("string helper module"),
15897 0,
15898 _string_methods,
15899 NULL,
15900 NULL,
15901 NULL,
15902 NULL
15903};
15904
15905PyMODINIT_FUNC
15906PyInit__string(void)
15907{
15908 return PyModule_Create(&_string_module);
15909}
15910
15911
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015912#ifdef __cplusplus
15913}
15914#endif