blob: 132f57aa6c90001f63b6ffadfa2efc09e08fca03 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
Victor Stinnerfecc4f22019-03-19 14:20:29 +010056/* Uncomment to display statistics on interned strings at exit when
57 using Valgrind or Insecure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
Victor Stinner8faf8212011-12-08 22:14:11 +010093/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
94#define MAX_UNICODE 0x10ffff
95
/* Object check used by the checked accessor macros in this file.
   Release builds only test the type; debug builds run the structural
   consistency check (check_content == 0, so string contents are not
   scanned). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* --- Raw field accessors -------------------------------------------------
   These cast `op` to the relevant struct and yield the field as an lvalue.
   They perform no checks of their own. */

/* Fields of PyASCIIObject */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)

/* Fields of PyCompactUnicodeObject */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Field of PyUnicodeObject */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors ---------------------------------------------------
   Each one asserts _PyUnicode_CHECK(op) before touching the object. */

/* UTF-8 buffer of a ready string: for a compact ASCII object this is the
   memory located right after the PyASCIIObject header, otherwise it is the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 buffer of a ready string: for a compact ASCII object
   this is the `length` field, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* state.kind, after asserting that `op` is a unicode object */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)

/* length, after asserting that `op` is a unicode object */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* File-local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the string is already ready and to the result of
   _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the utf8 pointer of `op` is the same address as its character
   data, i.e. no separate UTF-8 buffer is involved.  Must not be used on
   compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of `op` is the same address as its character
   data.  The macro must only refer to its own parameter: the previous
   expansion used the identifier `unicode`, which silently picked up
   whatever variable of that name existed at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object has its own UTF-8 memory block: the object is
   not compact ASCII, the utf8 pointer is set, and it does not point at the
   character data (i.e. it is not shared with other data). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op) != NULL                     \
     && _PyUnicode_UTF8(op) != PyUnicode_DATA(op))

/* True if the Unicode object has its own wstr memory block: the wstr
   pointer is set and either the string is not ready yet or wstr does not
   point at the character data (i.e. it is not shared with other data). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (_PyUnicode_WSTR(op) != NULL                        \
     && (!PyUnicode_IS_READY(op)                        \
         || _PyUnicode_WSTR(op) != PyUnicode_DATA(op)))
165
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters and should be of type "from_type *"
   (const-qualified or not); the source is only read.  to is a pointer of
   type "to_type *" and points to the buffer where the result characters are
   written to.

   The bulk of the input is copied four elements per iteration; the
   remaining 0-3 elements are copied one at a time.  Every element is
   converted with a plain cast to to_type. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        /* cast directly to a const pointer so that a const-qualified */ \
        /* source does not have its qualifier cast away */ \
        const from_type *_iter = (const from_type *)(begin); \
        const from_type *_end = (const from_type *)(end); \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200190#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% is the best factor */
192# define OVERALLOCATE_FACTOR 2
193#else
   /* On Linux, overallocating by 25% is the best factor */
195# define OVERALLOCATE_FACTOR 4
196#endif
197
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string, creating the singleton
   with PyUnicode_New(0, 0) the first time it is needed.  If that creation
   fails, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty == NULL) {                    \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
        else {                                          \
            Py_INCREF(unicode_empty);                   \
        }                                               \
    } while (0)

/* Return the shared empty string from the enclosing function after taking
   a reference to it; the returned value is NULL when the singleton could
   not be created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Victor Stinner709d23d2019-05-02 14:56:30 -0400268static PyObject *
269unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270 const char *errors);
271static PyObject *
272unicode_decode_utf8(const char *s, Py_ssize_t size,
273 _Py_error_handler error_handler, const char *errors,
274 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200276/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200277static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279/* Single character Unicode strings in the Latin-1 range are being
280 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point for the range 0x00-0x7F: an entry is 1
   if the character counts as whitespace and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR,
       0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27
       0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
313
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200314/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200315static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100317static int unicode_modifiable(PyObject *unicode);
318
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319
Alexander Belopolsky40018472011-02-26 01:02:56 +0000320static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100321_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200322static PyObject *
323_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324static PyObject *
325_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000328unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000329 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100330 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
Alexander Belopolsky40018472011-02-26 01:02:56 +0000333static void
334raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300335 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100336 PyObject *unicode,
337 Py_ssize_t startpos, Py_ssize_t endpos,
338 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000339
/* Same kind of lookup table as the whitespace one, for line breaks: indexed
   by code point for the range 0x00-0x7F, an entry is 1 if the character is
   a line break and 0 otherwise. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F
       0x0A LINE FEED, 0x0B LINE TABULATION, 0x0C FORM FEED,
       0x0D CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR, 0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
367
INADA Naoki3ae20562017-01-16 20:41:20 +0900368static int convert_uc(PyObject *obj, void *addr);
369
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300370#include "clinic/unicodeobject.c.h"
371
Victor Stinner3d4226a2018-08-29 22:21:32 +0200372_Py_error_handler
373_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200374{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200376 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 }
378 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200385 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_OTHER;
397}
398
Victor Stinner709d23d2019-05-02 14:56:30 -0400399
400static _Py_error_handler
401get_error_handler_wide(const wchar_t *errors)
402{
403 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (wcscmp(errors, L"surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (wcscmp(errors, L"replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (wcscmp(errors, L"ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (wcscmp(errors, L"backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (wcscmp(errors, L"surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425}
426
427
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300428/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000430Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000431PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000433#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000434 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000435#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 /* This is actually an illegal character, so it should
437 not be passed to unichr. */
438 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000439#endif
440}
441
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200442int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100443_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200444{
445 PyASCIIObject *ascii;
446 unsigned int kind;
447
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200448 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200449
450 ascii = (PyASCIIObject *)op;
451 kind = ascii->state.kind;
452
Victor Stinnera3b334d2011-10-03 13:53:37 +0200453 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200454 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
455 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200456 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200457 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200458 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200459 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200460
Victor Stinnera41463c2011-10-04 01:05:08 +0200461 if (ascii->state.compact == 1) {
462 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200463 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
464 || kind == PyUnicode_2BYTE_KIND
465 || kind == PyUnicode_4BYTE_KIND);
466 _PyObject_ASSERT(op, ascii->state.ascii == 0);
467 _PyObject_ASSERT(op, ascii->state.ready == 1);
468 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100469 }
470 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200471 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
472
473 data = unicode->data.any;
474 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200475 _PyObject_ASSERT(op, ascii->length == 0);
476 _PyObject_ASSERT(op, ascii->hash == -1);
477 _PyObject_ASSERT(op, ascii->state.compact == 0);
478 _PyObject_ASSERT(op, ascii->state.ascii == 0);
479 _PyObject_ASSERT(op, ascii->state.ready == 0);
480 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
481 _PyObject_ASSERT(op, ascii->wstr != NULL);
482 _PyObject_ASSERT(op, data == NULL);
483 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200484 }
485 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200486 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
487 || kind == PyUnicode_2BYTE_KIND
488 || kind == PyUnicode_4BYTE_KIND);
489 _PyObject_ASSERT(op, ascii->state.compact == 0);
490 _PyObject_ASSERT(op, ascii->state.ready == 1);
491 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200492 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200493 _PyObject_ASSERT(op, compact->utf8 == data);
494 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200495 }
496 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200497 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200498 }
499 }
500 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200501 if (
502#if SIZEOF_WCHAR_T == 2
503 kind == PyUnicode_2BYTE_KIND
504#else
505 kind == PyUnicode_4BYTE_KIND
506#endif
507 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200508 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200509 _PyObject_ASSERT(op, ascii->wstr == data);
510 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200511 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200512 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200513 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200514
515 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200516 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200518 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200519 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520
521 /* check that the best kind is used: O(n) operation */
522 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200523 Py_ssize_t i;
524 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200525 void *data;
526 Py_UCS4 ch;
527
528 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200529 for (i=0; i < ascii->length; i++)
530 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200531 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200532 if (ch > maxchar)
533 maxchar = ch;
534 }
535 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100536 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 _PyObject_ASSERT(op, maxchar >= 128);
538 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100539 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200540 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200541 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200542 }
Victor Stinner77faf692011-11-20 18:56:05 +0100543 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200544 _PyObject_ASSERT(op, maxchar >= 0x100);
545 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100546 }
547 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200548 _PyObject_ASSERT(op, maxchar >= 0x10000);
549 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100550 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200551 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200552 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400553 return 1;
554}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200555
Victor Stinner910337b2011-10-03 03:20:16 +0200556
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100557static PyObject*
558unicode_result_wchar(PyObject *unicode)
559{
560#ifndef Py_DEBUG
561 Py_ssize_t len;
562
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100563 len = _PyUnicode_WSTR_LENGTH(unicode);
564 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200566 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100567 }
568
569 if (len == 1) {
570 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100571 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
573 Py_DECREF(unicode);
574 return latin1_char;
575 }
576 }
577
578 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200579 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100580 return NULL;
581 }
582#else
Victor Stinneraa771272012-10-04 02:32:58 +0200583 assert(Py_REFCNT(unicode) == 1);
584
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100585 /* don't make the result ready in debug mode to ensure that the caller
586 makes the string ready before using it */
587 assert(_PyUnicode_CheckConsistency(unicode, 1));
588#endif
589 return unicode;
590}
591
592static PyObject*
593unicode_result_ready(PyObject *unicode)
594{
595 Py_ssize_t length;
596
597 length = PyUnicode_GET_LENGTH(unicode);
598 if (length == 0) {
599 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100600 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200601 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100602 }
603 return unicode_empty;
604 }
605
606 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200607 void *data = PyUnicode_DATA(unicode);
608 int kind = PyUnicode_KIND(unicode);
609 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100610 if (ch < 256) {
611 PyObject *latin1_char = unicode_latin1[ch];
612 if (latin1_char != NULL) {
613 if (unicode != latin1_char) {
614 Py_INCREF(latin1_char);
615 Py_DECREF(unicode);
616 }
617 return latin1_char;
618 }
619 else {
620 assert(_PyUnicode_CheckConsistency(unicode, 1));
621 Py_INCREF(unicode);
622 unicode_latin1[ch] = unicode;
623 return unicode;
624 }
625 }
626 }
627
628 assert(_PyUnicode_CheckConsistency(unicode, 1));
629 return unicode;
630}
631
632static PyObject*
633unicode_result(PyObject *unicode)
634{
635 assert(_PyUnicode_CHECK(unicode));
636 if (PyUnicode_IS_READY(unicode))
637 return unicode_result_ready(unicode);
638 else
639 return unicode_result_wchar(unicode);
640}
641
Victor Stinnerc4b49542011-12-11 22:44:26 +0100642static PyObject*
643unicode_result_unchanged(PyObject *unicode)
644{
645 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500646 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100647 return NULL;
648 Py_INCREF(unicode);
649 return unicode;
650 }
651 else
652 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100653 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100654}
655
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200656/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 0x100)
678 incr = 2+2;
679 else if (ch < 0x10000)
680 incr = 2+4;
681 else {
682 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200683 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200684 }
685 if (size > PY_SSIZE_T_MAX - incr) {
686 PyErr_SetString(PyExc_OverflowError,
687 "encoded result is too long for a Python string");
688 return NULL;
689 }
690 size += incr;
691 }
692
Victor Stinnerad771582015-10-09 12:38:53 +0200693 str = _PyBytesWriter_Prepare(writer, str, size);
694 if (str == NULL)
695 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696
697 /* generate replacement */
698 for (i = collstart; i < collend; ++i) {
699 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200700 *str++ = '\\';
701 if (ch >= 0x00010000) {
702 *str++ = 'U';
703 *str++ = Py_hexdigits[(ch>>28)&0xf];
704 *str++ = Py_hexdigits[(ch>>24)&0xf];
705 *str++ = Py_hexdigits[(ch>>20)&0xf];
706 *str++ = Py_hexdigits[(ch>>16)&0xf];
707 *str++ = Py_hexdigits[(ch>>12)&0xf];
708 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200709 }
Victor Stinner797485e2015-10-09 03:17:30 +0200710 else if (ch >= 0x100) {
711 *str++ = 'u';
712 *str++ = Py_hexdigits[(ch>>12)&0xf];
713 *str++ = Py_hexdigits[(ch>>8)&0xf];
714 }
715 else
716 *str++ = 'x';
717 *str++ = Py_hexdigits[(ch>>4)&0xf];
718 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 }
720 return str;
721}
722
723/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
724 ASCII, Latin1, UTF-8, etc. */
725static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200726xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200727 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
728{
Victor Stinnerad771582015-10-09 12:38:53 +0200729 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200730 Py_UCS4 ch;
731 enum PyUnicode_Kind kind;
732 void *data;
733
734 assert(PyUnicode_IS_READY(unicode));
735 kind = PyUnicode_KIND(unicode);
736 data = PyUnicode_DATA(unicode);
737
738 size = 0;
739 /* determine replacement size */
740 for (i = collstart; i < collend; ++i) {
741 Py_ssize_t incr;
742
743 ch = PyUnicode_READ(kind, data, i);
744 if (ch < 10)
745 incr = 2+1+1;
746 else if (ch < 100)
747 incr = 2+2+1;
748 else if (ch < 1000)
749 incr = 2+3+1;
750 else if (ch < 10000)
751 incr = 2+4+1;
752 else if (ch < 100000)
753 incr = 2+5+1;
754 else if (ch < 1000000)
755 incr = 2+6+1;
756 else {
757 assert(ch <= MAX_UNICODE);
758 incr = 2+7+1;
759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
775 }
776 return str;
777}
778
Thomas Wouters477c8d52006-05-27 19:21:47 +0000779/* --- Bloom Filters ----------------------------------------------------- */
780
781/* stuff to implement simple "bloom filters" for Unicode characters.
782 to keep things simple, we use a single bitmask, using the least 5
783 bits from each unicode characters as the bit index. */
784
785/* the linebreak mask is set up by Unicode_Init below */
786
Antoine Pitrouf068f942010-01-13 14:19:12 +0000787#if LONG_BIT >= 128
788#define BLOOM_WIDTH 128
789#elif LONG_BIT >= 64
790#define BLOOM_WIDTH 64
791#elif LONG_BIT >= 32
792#define BLOOM_WIDTH 32
793#else
794#error "LONG_BIT is smaller than 32"
795#endif
796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797#define BLOOM_MASK unsigned long
798
Serhiy Storchaka05997252013-01-26 12:14:02 +0200799static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000800
Antoine Pitrouf068f942010-01-13 14:19:12 +0000801#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802
Benjamin Peterson29060642009-01-31 22:14:21 +0000803#define BLOOM_LINEBREAK(ch) \
804 ((ch) < 128U ? ascii_linebreak[(ch)] : \
805 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700807static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809{
Victor Stinnera85af502013-04-09 21:53:54 +0200810#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
811 do { \
812 TYPE *data = (TYPE *)PTR; \
813 TYPE *end = data + LEN; \
814 Py_UCS4 ch; \
815 for (; data != end; data++) { \
816 ch = *data; \
817 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
818 } \
819 break; \
820 } while (0)
821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* calculate simple bloom-style bitmask for a given unicode string */
823
Antoine Pitrouf068f942010-01-13 14:19:12 +0000824 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000825
826 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200827 switch (kind) {
828 case PyUnicode_1BYTE_KIND:
829 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
830 break;
831 case PyUnicode_2BYTE_KIND:
832 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
833 break;
834 case PyUnicode_4BYTE_KIND:
835 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
836 break;
837 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700838 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200839 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000840 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200841
842#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000843}
844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300845static int
846ensure_unicode(PyObject *obj)
847{
848 if (!PyUnicode_Check(obj)) {
849 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200850 "must be str, not %.100s",
851 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300852 return -1;
853 }
854 return PyUnicode_READY(obj);
855}
856
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857/* Compilation of templated routines */
858
859#include "stringlib/asciilib.h"
860#include "stringlib/fastsearch.h"
861#include "stringlib/partition.h"
862#include "stringlib/split.h"
863#include "stringlib/count.h"
864#include "stringlib/find.h"
865#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200866#include "stringlib/undef.h"
867
868#include "stringlib/ucs1lib.h"
869#include "stringlib/fastsearch.h"
870#include "stringlib/partition.h"
871#include "stringlib/split.h"
872#include "stringlib/count.h"
873#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300874#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200875#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200876#include "stringlib/undef.h"
877
878#include "stringlib/ucs2lib.h"
879#include "stringlib/fastsearch.h"
880#include "stringlib/partition.h"
881#include "stringlib/split.h"
882#include "stringlib/count.h"
883#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300884#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200885#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200886#include "stringlib/undef.h"
887
888#include "stringlib/ucs4lib.h"
889#include "stringlib/fastsearch.h"
890#include "stringlib/partition.h"
891#include "stringlib/split.h"
892#include "stringlib/count.h"
893#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300894#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200895#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200896#include "stringlib/undef.h"
897
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200898#include "stringlib/unicodedefs.h"
899#include "stringlib/fastsearch.h"
900#include "stringlib/count.h"
901#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100902#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200903
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904/* --- Unicode Object ----------------------------------------------------- */
905
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700906static inline Py_ssize_t
907findchar(const void *s, int kind,
908 Py_ssize_t size, Py_UCS4 ch,
909 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200911 switch (kind) {
912 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200913 if ((Py_UCS1) ch != ch)
914 return -1;
915 if (direction > 0)
916 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
917 else
918 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200919 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200920 if ((Py_UCS2) ch != ch)
921 return -1;
922 if (direction > 0)
923 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
924 else
925 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200926 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200927 if (direction > 0)
928 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
929 else
930 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700932 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934}
935
#ifdef Py_DEBUG
/* Debug aid: overwrite the characters added by a resize with 0xff bytes so
   that use of uninitialized data is detected earlier.

   Characters [old_length, current length) are filled; nothing is done when
   the string did not grow.  _PyUnicode_CheckConsistency(str, 1) detects
   invalid characters, at least for ASCII and UCS-4 strings: U+00FF is
   invalid in ASCII and U+FFFFFFFF is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int unit = PyUnicode_KIND(unicode);     /* used as bytes per character */
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
954
Victor Stinnerfe226c02011-10-03 03:52:20 +0200955static PyObject*
956resize_compact(PyObject *unicode, Py_ssize_t length)
957{
958 Py_ssize_t char_size;
959 Py_ssize_t struct_size;
960 Py_ssize_t new_size;
961 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100962 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200963#ifdef Py_DEBUG
964 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
965#endif
966
Victor Stinner79891572012-05-03 13:43:07 +0200967 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100969 assert(PyUnicode_IS_COMPACT(unicode));
970
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200971 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100972 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 struct_size = sizeof(PyASCIIObject);
974 else
975 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200976 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
979 PyErr_NoMemory();
980 return NULL;
981 }
982 new_size = (struct_size + (length + 1) * char_size);
983
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200984 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
985 PyObject_DEL(_PyUnicode_UTF8(unicode));
986 _PyUnicode_UTF8(unicode) = NULL;
987 _PyUnicode_UTF8_LENGTH(unicode) = 0;
988 }
Victor Stinner84def372011-12-11 20:04:56 +0100989 _Py_DEC_REFTOTAL;
990 _Py_ForgetReference(unicode);
991
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300992 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100993 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100994 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 PyErr_NoMemory();
996 return NULL;
997 }
Victor Stinner84def372011-12-11 20:04:56 +0100998 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001000
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001004 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001005 _PyUnicode_WSTR_LENGTH(unicode) = length;
1006 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001007 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1008 PyObject_DEL(_PyUnicode_WSTR(unicode));
1009 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001010 if (!PyUnicode_IS_ASCII(unicode))
1011 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001012 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001016 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1017 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 return unicode;
1020}
1021
Alexander Belopolsky40018472011-02-26 01:02:56 +00001022static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001023resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024{
Victor Stinner95663112011-10-04 01:03:50 +02001025 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001026 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 if (PyUnicode_IS_READY(unicode)) {
1031 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001032 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1036#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037
1038 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001039 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001040 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1041 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042
1043 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1044 PyErr_NoMemory();
1045 return -1;
1046 }
1047 new_size = (length + 1) * char_size;
1048
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1050 {
1051 PyObject_DEL(_PyUnicode_UTF8(unicode));
1052 _PyUnicode_UTF8(unicode) = NULL;
1053 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1054 }
1055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 data = (PyObject *)PyObject_REALLOC(data, new_size);
1057 if (data == NULL) {
1058 PyErr_NoMemory();
1059 return -1;
1060 }
1061 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001062 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 _PyUnicode_WSTR_LENGTH(unicode) = length;
1065 }
1066 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001067 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 _PyUnicode_UTF8_LENGTH(unicode) = length;
1069 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _PyUnicode_LENGTH(unicode) = length;
1071 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001072#ifdef Py_DEBUG
1073 unicode_fill_invalid(unicode, old_length);
1074#endif
Victor Stinner95663112011-10-04 01:03:50 +02001075 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001076 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 }
Victor Stinner95663112011-10-04 01:03:50 +02001080 assert(_PyUnicode_WSTR(unicode) != NULL);
1081
1082 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001083 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001084 PyErr_NoMemory();
1085 return -1;
1086 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001087 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001088 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001090 if (!wstr) {
1091 PyErr_NoMemory();
1092 return -1;
1093 }
1094 _PyUnicode_WSTR(unicode) = wstr;
1095 _PyUnicode_WSTR(unicode)[length] = 0;
1096 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 return 0;
1099}
1100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101static PyObject*
1102resize_copy(PyObject *unicode, Py_ssize_t length)
1103{
1104 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001106 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001107
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001108 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109
1110 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1111 if (copy == NULL)
1112 return NULL;
1113
1114 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001115 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001117 }
1118 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001119 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 if (w == NULL)
1123 return NULL;
1124 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1125 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001126 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001127 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001128 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
1130}
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001133 Ux0000 terminated; some code (e.g. new_identifier)
1134 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135
1136 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001137 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139*/
1140
Alexander Belopolsky40018472011-02-26 01:02:56 +00001141static PyUnicodeObject *
1142_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001144 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
Thomas Wouters477c8d52006-05-27 19:21:47 +00001147 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 if (length == 0 && unicode_empty != NULL) {
1149 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001150 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 }
1152
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001153 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001155 return (PyUnicodeObject *)PyErr_NoMemory();
1156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 if (length < 0) {
1158 PyErr_SetString(PyExc_SystemError,
1159 "Negative size passed to _PyUnicode_New");
1160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 }
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1164 if (unicode == NULL)
1165 return NULL;
1166 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001167
1168 _PyUnicode_WSTR_LENGTH(unicode) = length;
1169 _PyUnicode_HASH(unicode) = -1;
1170 _PyUnicode_STATE(unicode).interned = 0;
1171 _PyUnicode_STATE(unicode).kind = 0;
1172 _PyUnicode_STATE(unicode).compact = 0;
1173 _PyUnicode_STATE(unicode).ready = 0;
1174 _PyUnicode_STATE(unicode).ascii = 0;
1175 _PyUnicode_DATA_ANY(unicode) = NULL;
1176 _PyUnicode_LENGTH(unicode) = 0;
1177 _PyUnicode_UTF8(unicode) = NULL;
1178 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1181 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001182 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001183 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001184 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186
Jeremy Hyltond8082792003-09-16 19:41:39 +00001187 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001188 * the caller fails before initializing str -- unicode_resize()
1189 * reads str[0], and the Keep-Alive optimization can keep memory
1190 * allocated for str alive across a call to unicode_dealloc(unicode).
1191 * We don't want unicode_resize to read uninitialized memory in
1192 * that case.
1193 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 _PyUnicode_WSTR(unicode)[0] = 0;
1195 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001196
Victor Stinner7931d9a2011-11-04 00:22:48 +01001197 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 return unicode;
1199}
1200
Victor Stinnerf42dc442011-10-02 23:33:16 +02001201static const char*
1202unicode_kind_name(PyObject *unicode)
1203{
Victor Stinner42dfd712011-10-03 14:41:45 +02001204 /* don't check consistency: unicode_kind_name() is called from
1205 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 if (!PyUnicode_IS_COMPACT(unicode))
1207 {
1208 if (!PyUnicode_IS_READY(unicode))
1209 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001210 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001211 {
1212 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001213 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 return "legacy ascii";
1215 else
1216 return "legacy latin1";
1217 case PyUnicode_2BYTE_KIND:
1218 return "legacy UCS2";
1219 case PyUnicode_4BYTE_KIND:
1220 return "legacy UCS4";
1221 default:
1222 return "<legacy invalid kind>";
1223 }
1224 }
1225 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001226 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001227 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001228 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 return "ascii";
1230 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001231 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001232 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001233 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001234 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001235 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001236 default:
1237 return "<invalid compact kind>";
1238 }
1239}
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001243char *_PyUnicode_utf8(void *unicode_raw){
1244 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001245 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246}
1247
Victor Stinnera42de742018-11-22 10:25:22 +01001248void *_PyUnicode_compact_data(void *unicode_raw) {
1249 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 return _PyUnicode_COMPACT_DATA(unicode);
1251}
Victor Stinnera42de742018-11-22 10:25:22 +01001252void *_PyUnicode_data(void *unicode_raw) {
1253 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001254 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1256 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1257 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1258 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1259 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1260 return PyUnicode_DATA(unicode);
1261}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001262
1263void
1264_PyUnicode_Dump(PyObject *op)
1265{
1266 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001267 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1268 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1269 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001270
Victor Stinnera849a4b2011-10-03 12:12:11 +02001271 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001272 {
1273 if (ascii->state.ascii)
1274 data = (ascii + 1);
1275 else
1276 data = (compact + 1);
1277 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001278 else
1279 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001280 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1281 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001282
Victor Stinnera849a4b2011-10-03 12:12:11 +02001283 if (ascii->wstr == data)
1284 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001285 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001286
Victor Stinnera3b334d2011-10-03 13:53:37 +02001287 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001288 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001289 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1290 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001291 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001292 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001293 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001294 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001295}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#endif
1297
1298PyObject *
1299PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1300{
1301 PyObject *obj;
1302 PyCompactUnicodeObject *unicode;
1303 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001304 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001305 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 Py_ssize_t char_size;
1307 Py_ssize_t struct_size;
1308
1309 /* Optimization for empty strings */
1310 if (size == 0 && unicode_empty != NULL) {
1311 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001312 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 }
1314
Victor Stinner9e9d6892011-10-04 01:02:02 +02001315 is_ascii = 0;
1316 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 struct_size = sizeof(PyCompactUnicodeObject);
1318 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001319 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 char_size = 1;
1321 is_ascii = 1;
1322 struct_size = sizeof(PyASCIIObject);
1323 }
1324 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001325 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 char_size = 1;
1327 }
1328 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001329 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 char_size = 2;
1331 if (sizeof(wchar_t) == 2)
1332 is_sharing = 1;
1333 }
1334 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001335 if (maxchar > MAX_UNICODE) {
1336 PyErr_SetString(PyExc_SystemError,
1337 "invalid maximum character passed to PyUnicode_New");
1338 return NULL;
1339 }
Victor Stinner8f825062012-04-27 13:55:39 +02001340 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 char_size = 4;
1342 if (sizeof(wchar_t) == 4)
1343 is_sharing = 1;
1344 }
1345
1346 /* Ensure we won't overflow the size. */
1347 if (size < 0) {
1348 PyErr_SetString(PyExc_SystemError,
1349 "Negative size passed to PyUnicode_New");
1350 return NULL;
1351 }
1352 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1353 return PyErr_NoMemory();
1354
1355 /* Duplicated allocation code from _PyObject_New() instead of a call to
1356 * PyObject_New() so we are able to allocate space for the object and
1357 * it's data buffer.
1358 */
1359 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1360 if (obj == NULL)
1361 return PyErr_NoMemory();
1362 obj = PyObject_INIT(obj, &PyUnicode_Type);
1363 if (obj == NULL)
1364 return NULL;
1365
1366 unicode = (PyCompactUnicodeObject *)obj;
1367 if (is_ascii)
1368 data = ((PyASCIIObject*)obj) + 1;
1369 else
1370 data = unicode + 1;
1371 _PyUnicode_LENGTH(unicode) = size;
1372 _PyUnicode_HASH(unicode) = -1;
1373 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001374 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 _PyUnicode_STATE(unicode).compact = 1;
1376 _PyUnicode_STATE(unicode).ready = 1;
1377 _PyUnicode_STATE(unicode).ascii = is_ascii;
1378 if (is_ascii) {
1379 ((char*)data)[size] = 0;
1380 _PyUnicode_WSTR(unicode) = NULL;
1381 }
Victor Stinner8f825062012-04-27 13:55:39 +02001382 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 ((char*)data)[size] = 0;
1384 _PyUnicode_WSTR(unicode) = NULL;
1385 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001387 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 else {
1390 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001391 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001392 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 ((Py_UCS4*)data)[size] = 0;
1396 if (is_sharing) {
1397 _PyUnicode_WSTR_LENGTH(unicode) = size;
1398 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1399 }
1400 else {
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 }
1404 }
Victor Stinner8f825062012-04-27 13:55:39 +02001405#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001406 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001407#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001408 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 return obj;
1410}
1411
#if SIZEOF_WCHAR_T == 2
/* Expand a 16-bit wchar_t buffer [begin, end) into the UCS4 data of
   `unicode`, combining each high/low surrogate pair into one code point.
   Unpaired surrogates are copied through unchanged.  The other width
   conversions are done with macros for efficiency.

   The caller must have sized `unicode` (a 4-byte kind string) so that it
   holds exactly the decoded characters plus a terminating null
   character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        if (!(Py_UNICODE_IS_HIGH_SURROGATE(src[0])
              && (src + 1) < end
              && Py_UNICODE_IS_LOW_SURROGATE(src[1])))
        {
            /* Ordinary code unit or lone surrogate: copy as is. */
            *dst++ = *src++;
            continue;
        }

        /* A well-formed surrogate pair yields a single code point. */
        *dst++ = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
        src += 2;
    }

    /* Every slot of the destination must have been written. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1451
Victor Stinnercd9950f2011-10-02 00:34:53 +02001452static int
Victor Stinner488fa492011-12-12 00:01:39 +01001453unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001454{
Victor Stinner488fa492011-12-12 00:01:39 +01001455 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001456 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001457 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001458 return -1;
1459 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460 return 0;
1461}
1462
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001463static int
1464_copy_characters(PyObject *to, Py_ssize_t to_start,
1465 PyObject *from, Py_ssize_t from_start,
1466 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001468 unsigned int from_kind, to_kind;
1469 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Victor Stinneree4544c2012-05-09 22:24:08 +02001471 assert(0 <= how_many);
1472 assert(0 <= from_start);
1473 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001474 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001475 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001476 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477
Victor Stinnerd3f08822012-05-29 12:57:52 +02001478 assert(PyUnicode_Check(to));
1479 assert(PyUnicode_IS_READY(to));
1480 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1481
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001482 if (how_many == 0)
1483 return 0;
1484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489
Victor Stinnerf1852262012-06-16 16:38:26 +02001490#ifdef Py_DEBUG
1491 if (!check_maxchar
1492 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1493 {
1494 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1495 Py_UCS4 ch;
1496 Py_ssize_t i;
1497 for (i=0; i < how_many; i++) {
1498 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1499 assert(ch <= to_maxchar);
1500 }
1501 }
1502#endif
1503
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001504 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001505 if (check_maxchar
1506 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1507 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001508 /* Writing Latin-1 characters into an ASCII string requires to
1509 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 Py_UCS4 max_char;
1511 max_char = ucs1lib_find_max_char(from_data,
1512 (Py_UCS1*)from_data + how_many);
1513 if (max_char >= 128)
1514 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 }
Christian Heimesf051e432016-09-13 20:22:02 +02001516 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001517 (char*)from_data + from_kind * from_start,
1518 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 else if (from_kind == PyUnicode_1BYTE_KIND
1521 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001522 {
1523 _PyUnicode_CONVERT_BYTES(
1524 Py_UCS1, Py_UCS2,
1525 PyUnicode_1BYTE_DATA(from) + from_start,
1526 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1527 PyUnicode_2BYTE_DATA(to) + to_start
1528 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001529 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001530 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001531 && to_kind == PyUnicode_4BYTE_KIND)
1532 {
1533 _PyUnicode_CONVERT_BYTES(
1534 Py_UCS1, Py_UCS4,
1535 PyUnicode_1BYTE_DATA(from) + from_start,
1536 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1537 PyUnicode_4BYTE_DATA(to) + to_start
1538 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001539 }
1540 else if (from_kind == PyUnicode_2BYTE_KIND
1541 && to_kind == PyUnicode_4BYTE_KIND)
1542 {
1543 _PyUnicode_CONVERT_BYTES(
1544 Py_UCS2, Py_UCS4,
1545 PyUnicode_2BYTE_DATA(from) + from_start,
1546 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1547 PyUnicode_4BYTE_DATA(to) + to_start
1548 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001549 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001551 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1552
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001553 if (!check_maxchar) {
1554 if (from_kind == PyUnicode_2BYTE_KIND
1555 && to_kind == PyUnicode_1BYTE_KIND)
1556 {
1557 _PyUnicode_CONVERT_BYTES(
1558 Py_UCS2, Py_UCS1,
1559 PyUnicode_2BYTE_DATA(from) + from_start,
1560 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1561 PyUnicode_1BYTE_DATA(to) + to_start
1562 );
1563 }
1564 else if (from_kind == PyUnicode_4BYTE_KIND
1565 && to_kind == PyUnicode_1BYTE_KIND)
1566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS4, Py_UCS1,
1569 PyUnicode_4BYTE_DATA(from) + from_start,
1570 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_1BYTE_DATA(to) + to_start
1572 );
1573 }
1574 else if (from_kind == PyUnicode_4BYTE_KIND
1575 && to_kind == PyUnicode_2BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS4, Py_UCS2,
1579 PyUnicode_4BYTE_DATA(from) + from_start,
1580 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_2BYTE_DATA(to) + to_start
1582 );
1583 }
1584 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001585 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001586 }
1587 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001589 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 Py_ssize_t i;
1592
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 if (ch > to_maxchar)
1596 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001597 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1598 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 }
1600 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001601 return 0;
1602}
1603
Victor Stinnerd3f08822012-05-29 12:57:52 +02001604void
1605_PyUnicode_FastCopyCharacters(
1606 PyObject *to, Py_ssize_t to_start,
1607 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608{
1609 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1610}
1611
1612Py_ssize_t
1613PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1614 PyObject *from, Py_ssize_t from_start,
1615 Py_ssize_t how_many)
1616{
1617 int err;
1618
1619 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1620 PyErr_BadInternalCall();
1621 return -1;
1622 }
1623
Benjamin Petersonbac79492012-01-14 13:34:47 -05001624 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001625 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001626 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001627 return -1;
1628
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001629 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001630 PyErr_SetString(PyExc_IndexError, "string index out of range");
1631 return -1;
1632 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001633 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001634 PyErr_SetString(PyExc_IndexError, "string index out of range");
1635 return -1;
1636 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001637 if (how_many < 0) {
1638 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1639 return -1;
1640 }
1641 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001642 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1643 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001644 "Cannot write %zi characters at %zi "
1645 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001646 how_many, to_start, PyUnicode_GET_LENGTH(to));
1647 return -1;
1648 }
1649
1650 if (how_many == 0)
1651 return 0;
1652
Victor Stinner488fa492011-12-12 00:01:39 +01001653 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001654 return -1;
1655
1656 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1657 if (err) {
1658 PyErr_Format(PyExc_SystemError,
1659 "Cannot copy %s characters "
1660 "into a string of %s characters",
1661 unicode_kind_name(from),
1662 unicode_kind_name(to));
1663 return -1;
1664 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001665 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666}
1667
Victor Stinner17222162011-09-28 22:15:37 +02001668/* Find the maximum code point and count the number of surrogate pairs so a
1669 correct string length can be computed before converting a string to UCS4.
1670 This function counts single surrogates as a character and not as a pair.
1671
1672 Return 0 on success, or -1 on error. */
1673static int
1674find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1675 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676{
1677 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679
Victor Stinnerc53be962011-10-02 21:33:54 +02001680 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 *num_surrogates = 0;
1682 *maxchar = 0;
1683
1684 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001686 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1687 && (iter+1) < end
1688 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1689 {
1690 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1691 ++(*num_surrogates);
1692 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001696 {
1697 ch = *iter;
1698 iter++;
1699 }
1700 if (ch > *maxchar) {
1701 *maxchar = ch;
1702 if (*maxchar > MAX_UNICODE) {
1703 PyErr_Format(PyExc_ValueError,
1704 "character U+%x is not in range [U+0000; U+10ffff]",
1705 ch);
1706 return -1;
1707 }
1708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
1710 return 0;
1711}
1712
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001713int
1714_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715{
1716 wchar_t *end;
1717 Py_UCS4 maxchar = 0;
1718 Py_ssize_t num_surrogates;
1719#if SIZEOF_WCHAR_T == 2
1720 Py_ssize_t length_wo_surrogates;
1721#endif
1722
Georg Brandl7597add2011-10-05 16:36:47 +02001723 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001724 strings were created using _PyObject_New() and where no canonical
1725 representation (the str field) has been set yet aka strings
1726 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001727 assert(_PyUnicode_CHECK(unicode));
1728 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001731 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 /* Actually, it should neither be interned nor be anything else: */
1733 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001736 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001737 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739
1740 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001741 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1742 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 PyErr_NoMemory();
1744 return -1;
1745 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001746 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 _PyUnicode_WSTR(unicode), end,
1748 PyUnicode_1BYTE_DATA(unicode));
1749 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1750 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1751 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1752 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001753 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001754 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001755 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 }
1757 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001758 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001759 _PyUnicode_UTF8(unicode) = NULL;
1760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 }
1762 PyObject_FREE(_PyUnicode_WSTR(unicode));
1763 _PyUnicode_WSTR(unicode) = NULL;
1764 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1765 }
1766 /* In this case we might have to convert down from 4-byte native
1767 wchar_t to 2-byte unicode. */
1768 else if (maxchar < 65536) {
1769 assert(num_surrogates == 0 &&
1770 "FindMaxCharAndNumSurrogatePairs() messed up");
1771
Victor Stinner506f5922011-09-28 22:34:18 +02001772#if SIZEOF_WCHAR_T == 2
1773 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001775 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1776 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1777 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001778 _PyUnicode_UTF8(unicode) = NULL;
1779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001780#else
1781 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001782 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001783 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001785 PyErr_NoMemory();
1786 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 }
Victor Stinner506f5922011-09-28 22:34:18 +02001788 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1789 _PyUnicode_WSTR(unicode), end,
1790 PyUnicode_2BYTE_DATA(unicode));
1791 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1793 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001796 PyObject_FREE(_PyUnicode_WSTR(unicode));
1797 _PyUnicode_WSTR(unicode) = NULL;
1798 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1799#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1802 else {
1803#if SIZEOF_WCHAR_T == 2
1804 /* in case the native representation is 2-bytes, we need to allocate a
1805 new normalized 4-byte version. */
1806 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001807 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1808 PyErr_NoMemory();
1809 return -1;
1810 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001811 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1812 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 PyErr_NoMemory();
1814 return -1;
1815 }
1816 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1817 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001818 _PyUnicode_UTF8(unicode) = NULL;
1819 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001820 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1821 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001822 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyObject_FREE(_PyUnicode_WSTR(unicode));
1824 _PyUnicode_WSTR(unicode) = NULL;
1825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1826#else
1827 assert(num_surrogates == 0);
1828
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 _PyUnicode_UTF8(unicode) = NULL;
1832 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1834#endif
1835 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1836 }
1837 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001838 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return 0;
1840}
1841
Alexander Belopolsky40018472011-02-26 01:02:56 +00001842static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001843unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844{
Walter Dörwald16807132007-05-25 13:52:07 +00001845 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 case SSTATE_NOT_INTERNED:
1847 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001848
Benjamin Peterson29060642009-01-31 22:14:21 +00001849 case SSTATE_INTERNED_MORTAL:
1850 /* revive dead object temporarily for DelItem */
1851 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001852 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 Py_FatalError(
1854 "deletion of interned string failed");
1855 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001856
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 case SSTATE_INTERNED_IMMORTAL:
1858 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001859 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001860
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 default:
1862 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001863 }
1864
Victor Stinner03490912011-10-03 23:45:12 +02001865 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001867 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001868 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1870 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001872 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873}
1874
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one
   of the cached single-character Latin-1 strings, 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a one-character string with canonical data can be a cached
       Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1891
Alexander Belopolsky40018472011-02-26 01:02:56 +00001892static int
Victor Stinner488fa492011-12-12 00:01:39 +01001893unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894{
Victor Stinner488fa492011-12-12 00:01:39 +01001895 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001896 if (Py_REFCNT(unicode) != 1)
1897 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001898 if (_PyUnicode_HASH(unicode) != -1)
1899 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001900 if (PyUnicode_CHECK_INTERNED(unicode))
1901 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!PyUnicode_CheckExact(unicode))
1903 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001904#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001905 /* singleton refcount is greater than 1 */
1906 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001907#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001908 return 1;
1909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Victor Stinnerfe226c02011-10-03 03:52:20 +02001911static int
1912unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1913{
1914 PyObject *unicode;
1915 Py_ssize_t old_length;
1916
1917 assert(p_unicode != NULL);
1918 unicode = *p_unicode;
1919
1920 assert(unicode != NULL);
1921 assert(PyUnicode_Check(unicode));
1922 assert(0 <= length);
1923
Victor Stinner910337b2011-10-03 03:20:16 +02001924 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001925 old_length = PyUnicode_WSTR_LENGTH(unicode);
1926 else
1927 old_length = PyUnicode_GET_LENGTH(unicode);
1928 if (old_length == length)
1929 return 0;
1930
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001931 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001932 _Py_INCREF_UNICODE_EMPTY();
1933 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001935 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001936 return 0;
1937 }
1938
Victor Stinner488fa492011-12-12 00:01:39 +01001939 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 PyObject *copy = resize_copy(unicode, length);
1941 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001943 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001945 }
1946
Victor Stinnerfe226c02011-10-03 03:52:20 +02001947 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001948 PyObject *new_unicode = resize_compact(unicode, length);
1949 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001950 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001951 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001953 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001954 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001955}
1956
Alexander Belopolsky40018472011-02-26 01:02:56 +00001957int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001958PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001959{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001960 PyObject *unicode;
1961 if (p_unicode == NULL) {
1962 PyErr_BadInternalCall();
1963 return -1;
1964 }
1965 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001966 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 {
1968 PyErr_BadInternalCall();
1969 return -1;
1970 }
1971 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001972}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001973
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001974/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001975
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001976 WARNING: The function doesn't copy the terminating null character and
1977 doesn't check the maximum character (may write a latin1 character in an
1978 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001979static void
1980unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1981 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982{
1983 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1984 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001985 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001986
1987 switch (kind) {
1988 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001989 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001990#ifdef Py_DEBUG
1991 if (PyUnicode_IS_ASCII(unicode)) {
1992 Py_UCS4 maxchar = ucs1lib_find_max_char(
1993 (const Py_UCS1*)str,
1994 (const Py_UCS1*)str + len);
1995 assert(maxchar < 128);
1996 }
1997#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001998 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001999 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002000 }
2001 case PyUnicode_2BYTE_KIND: {
2002 Py_UCS2 *start = (Py_UCS2 *)data + index;
2003 Py_UCS2 *ucs2 = start;
2004 assert(index <= PyUnicode_GET_LENGTH(unicode));
2005
Victor Stinner184252a2012-06-16 02:57:41 +02002006 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002007 *ucs2 = (Py_UCS2)*str;
2008
2009 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002010 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002011 }
2012 default: {
2013 Py_UCS4 *start = (Py_UCS4 *)data + index;
2014 Py_UCS4 *ucs4 = start;
2015 assert(kind == PyUnicode_4BYTE_KIND);
2016 assert(index <= PyUnicode_GET_LENGTH(unicode));
2017
Victor Stinner184252a2012-06-16 02:57:41 +02002018 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002019 *ucs4 = (Py_UCS4)*str;
2020
2021 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002022 }
2023 }
2024}
2025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026static PyObject*
2027get_latin1_char(unsigned char ch)
2028{
Victor Stinnera464fc12011-10-02 20:39:30 +02002029 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002031 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!unicode)
2033 return NULL;
2034 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 unicode_latin1[ch] = unicode;
2037 }
2038 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002039 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040}
2041
Victor Stinner985a82a2014-01-03 12:53:47 +01002042static PyObject*
2043unicode_char(Py_UCS4 ch)
2044{
2045 PyObject *unicode;
2046
2047 assert(ch <= MAX_UNICODE);
2048
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002049 if (ch < 256)
2050 return get_latin1_char(ch);
2051
Victor Stinner985a82a2014-01-03 12:53:47 +01002052 unicode = PyUnicode_New(1, ch);
2053 if (unicode == NULL)
2054 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002055
2056 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2057 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002058 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002059 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002060 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2061 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2062 }
2063 assert(_PyUnicode_CheckConsistency(unicode, 1));
2064 return unicode;
2065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067PyObject *
2068PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002070 if (u == NULL)
2071 return (PyObject*)_PyUnicode_New(size);
2072
2073 if (size < 0) {
2074 PyErr_BadInternalCall();
2075 return NULL;
2076 }
2077
2078 return PyUnicode_FromWideChar(u, size);
2079}
2080
2081PyObject *
2082PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2083{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002084 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 Py_UCS4 maxchar = 0;
2086 Py_ssize_t num_surrogates;
2087
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002088 if (u == NULL && size != 0) {
2089 PyErr_BadInternalCall();
2090 return NULL;
2091 }
2092
2093 if (size == -1) {
2094 size = wcslen(u);
2095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002097 /* If the Unicode data is known at construction time, we can apply
2098 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002101 if (size == 0)
2102 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 /* Single character Unicode objects in the Latin-1 range are
2105 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002106 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 return get_latin1_char((unsigned char)*u);
2108
2109 /* If not empty and not single character, copy the Unicode data
2110 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002111 if (find_maxchar_surrogates(u, u + size,
2112 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return NULL;
2114
Victor Stinner8faf8212011-12-08 22:14:11 +01002115 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 if (!unicode)
2117 return NULL;
2118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 switch (PyUnicode_KIND(unicode)) {
2120 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002121 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2123 break;
2124 case PyUnicode_2BYTE_KIND:
2125#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002126 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002128 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2130#endif
2131 break;
2132 case PyUnicode_4BYTE_KIND:
2133#if SIZEOF_WCHAR_T == 2
2134 /* This is the only case which has to process surrogates, thus
2135 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002136 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137#else
2138 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002139 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140#endif
2141 break;
2142 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002143 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002146 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147}
2148
Alexander Belopolsky40018472011-02-26 01:02:56 +00002149PyObject *
2150PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002151{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 if (size < 0) {
2153 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002154 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 return NULL;
2156 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002157 if (u != NULL)
2158 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2159 else
2160 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002161}
2162
Alexander Belopolsky40018472011-02-26 01:02:56 +00002163PyObject *
2164PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002165{
2166 size_t size = strlen(u);
2167 if (size > PY_SSIZE_T_MAX) {
2168 PyErr_SetString(PyExc_OverflowError, "input too long");
2169 return NULL;
2170 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002171 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002172}
2173
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002174PyObject *
2175_PyUnicode_FromId(_Py_Identifier *id)
2176{
2177 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002178 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2179 strlen(id->string),
2180 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002181 if (!id->object)
2182 return NULL;
2183 PyUnicode_InternInPlace(&id->object);
2184 assert(!id->next);
2185 id->next = static_strings;
2186 static_strings = id;
2187 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002188 return id->object;
2189}
2190
2191void
2192_PyUnicode_ClearStaticStrings()
2193{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002194 _Py_Identifier *tmp, *s = static_strings;
2195 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002196 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002197 tmp = s->next;
2198 s->next = NULL;
2199 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002200 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002201 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002202}
2203
Benjamin Peterson0df54292012-03-26 14:50:32 -04002204/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205
Victor Stinnerd3f08822012-05-29 12:57:52 +02002206PyObject*
2207_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002208{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002209 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002210 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002211 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002212#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002213 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002214#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002215 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002216 }
Victor Stinner785938e2011-12-11 20:09:03 +01002217 unicode = PyUnicode_New(size, 127);
2218 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002219 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002220 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2221 assert(_PyUnicode_CheckConsistency(unicode, 1));
2222 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002223}
2224
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002225static Py_UCS4
2226kind_maxchar_limit(unsigned int kind)
2227{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002228 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002229 case PyUnicode_1BYTE_KIND:
2230 return 0x80;
2231 case PyUnicode_2BYTE_KIND:
2232 return 0x100;
2233 case PyUnicode_4BYTE_KIND:
2234 return 0x10000;
2235 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002236 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002237 }
2238}
2239
Victor Stinner702c7342011-10-05 13:50:52 +02002240static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002241_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002244 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245
Serhiy Storchaka678db842013-01-26 12:16:36 +02002246 if (size == 0)
2247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002249 if (size == 1)
2250 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002252 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002253 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!res)
2255 return NULL;
2256 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002257 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002259}
2260
Victor Stinnere57b1c02011-09-28 22:20:48 +02002261static PyObject*
2262_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263{
2264 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002265 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002266
Serhiy Storchaka678db842013-01-26 12:16:36 +02002267 if (size == 0)
2268 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002269 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002270 if (size == 1)
2271 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002273 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002274 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 if (!res)
2276 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002277 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002279 else {
2280 _PyUnicode_CONVERT_BYTES(
2281 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2282 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002283 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 return res;
2285}
2286
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287static PyObject*
2288_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289{
2290 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002291 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292
Serhiy Storchaka678db842013-01-26 12:16:36 +02002293 if (size == 0)
2294 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002296 if (size == 1)
2297 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002298
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002300 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 if (!res)
2302 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002303 if (max_char < 256)
2304 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2305 PyUnicode_1BYTE_DATA(res));
2306 else if (max_char < 0x10000)
2307 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2308 PyUnicode_2BYTE_DATA(res));
2309 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002311 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 return res;
2313}
2314
2315PyObject*
2316PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2317{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002318 if (size < 0) {
2319 PyErr_SetString(PyExc_ValueError, "size must be positive");
2320 return NULL;
2321 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002322 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002324 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002329 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002330 PyErr_SetString(PyExc_SystemError, "invalid kind");
2331 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333}
2334
Victor Stinnerece58de2012-04-23 23:36:38 +02002335Py_UCS4
2336_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2337{
2338 enum PyUnicode_Kind kind;
2339 void *startptr, *endptr;
2340
2341 assert(PyUnicode_IS_READY(unicode));
2342 assert(0 <= start);
2343 assert(end <= PyUnicode_GET_LENGTH(unicode));
2344 assert(start <= end);
2345
2346 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2347 return PyUnicode_MAX_CHAR_VALUE(unicode);
2348
2349 if (start == end)
2350 return 127;
2351
Victor Stinner94d558b2012-04-27 22:26:58 +02002352 if (PyUnicode_IS_ASCII(unicode))
2353 return 127;
2354
Victor Stinnerece58de2012-04-23 23:36:38 +02002355 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002356 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002357 endptr = (char *)startptr + end * kind;
2358 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002359 switch(kind) {
2360 case PyUnicode_1BYTE_KIND:
2361 return ucs1lib_find_max_char(startptr, endptr);
2362 case PyUnicode_2BYTE_KIND:
2363 return ucs2lib_find_max_char(startptr, endptr);
2364 case PyUnicode_4BYTE_KIND:
2365 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002366 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002367 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002368 }
2369}
2370
Victor Stinner25a4b292011-10-06 12:31:55 +02002371/* Ensure that a string uses the most efficient storage, if it is not the
2372 case: create a new string with of the right kind. Write NULL into *p_unicode
2373 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002374static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002375unicode_adjust_maxchar(PyObject **p_unicode)
2376{
2377 PyObject *unicode, *copy;
2378 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002379 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002380 unsigned int kind;
2381
2382 assert(p_unicode != NULL);
2383 unicode = *p_unicode;
2384 assert(PyUnicode_IS_READY(unicode));
2385 if (PyUnicode_IS_ASCII(unicode))
2386 return;
2387
2388 len = PyUnicode_GET_LENGTH(unicode);
2389 kind = PyUnicode_KIND(unicode);
2390 if (kind == PyUnicode_1BYTE_KIND) {
2391 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002392 max_char = ucs1lib_find_max_char(u, u + len);
2393 if (max_char >= 128)
2394 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002395 }
2396 else if (kind == PyUnicode_2BYTE_KIND) {
2397 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002398 max_char = ucs2lib_find_max_char(u, u + len);
2399 if (max_char >= 256)
2400 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002401 }
2402 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002403 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002404 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 max_char = ucs4lib_find_max_char(u, u + len);
2406 if (max_char >= 0x10000)
2407 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002408 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002409 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002410 if (copy != NULL)
2411 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002412 Py_DECREF(unicode);
2413 *p_unicode = copy;
2414}
2415
Victor Stinner034f6cf2011-09-30 02:26:44 +02002416PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002417_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002418{
Victor Stinner87af4f22011-11-21 23:03:47 +01002419 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002420 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002421
Victor Stinner034f6cf2011-09-30 02:26:44 +02002422 if (!PyUnicode_Check(unicode)) {
2423 PyErr_BadInternalCall();
2424 return NULL;
2425 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002426 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002427 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002428
Victor Stinner87af4f22011-11-21 23:03:47 +01002429 length = PyUnicode_GET_LENGTH(unicode);
2430 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002431 if (!copy)
2432 return NULL;
2433 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2434
Christian Heimesf051e432016-09-13 20:22:02 +02002435 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002436 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002437 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002438 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002439}
2440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441
Victor Stinnerbc603d12011-10-02 01:00:40 +02002442/* Widen Unicode objects to larger buffers. Don't write terminating null
2443 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444
2445void*
2446_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2447{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002448 Py_ssize_t len;
2449 void *result;
2450 unsigned int skind;
2451
Benjamin Petersonbac79492012-01-14 13:34:47 -05002452 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002453 return NULL;
2454
2455 len = PyUnicode_GET_LENGTH(s);
2456 skind = PyUnicode_KIND(s);
2457 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002461 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002462 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002463 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002464 if (!result)
2465 return PyErr_NoMemory();
2466 assert(skind == PyUnicode_1BYTE_KIND);
2467 _PyUnicode_CONVERT_BYTES(
2468 Py_UCS1, Py_UCS2,
2469 PyUnicode_1BYTE_DATA(s),
2470 PyUnicode_1BYTE_DATA(s) + len,
2471 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002473 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002474 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002475 if (!result)
2476 return PyErr_NoMemory();
2477 if (skind == PyUnicode_2BYTE_KIND) {
2478 _PyUnicode_CONVERT_BYTES(
2479 Py_UCS2, Py_UCS4,
2480 PyUnicode_2BYTE_DATA(s),
2481 PyUnicode_2BYTE_DATA(s) + len,
2482 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002484 else {
2485 assert(skind == PyUnicode_1BYTE_KIND);
2486 _PyUnicode_CONVERT_BYTES(
2487 Py_UCS1, Py_UCS4,
2488 PyUnicode_1BYTE_DATA(s),
2489 PyUnicode_1BYTE_DATA(s) + len,
2490 result);
2491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002493 default:
2494 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 }
Victor Stinner01698042011-10-04 00:04:26 +02002496 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return NULL;
2498}
2499
2500static Py_UCS4*
2501as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
2504 int kind;
2505 void *data;
2506 Py_ssize_t len, targetlen;
2507 if (PyUnicode_READY(string) == -1)
2508 return NULL;
2509 kind = PyUnicode_KIND(string);
2510 data = PyUnicode_DATA(string);
2511 len = PyUnicode_GET_LENGTH(string);
2512 targetlen = len;
2513 if (copy_null)
2514 targetlen++;
2515 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002516 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 if (!target) {
2518 PyErr_NoMemory();
2519 return NULL;
2520 }
2521 }
2522 else {
2523 if (targetsize < targetlen) {
2524 PyErr_Format(PyExc_SystemError,
2525 "string is longer than the buffer");
2526 if (copy_null && 0 < targetsize)
2527 target[0] = 0;
2528 return NULL;
2529 }
2530 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002531 if (kind == PyUnicode_1BYTE_KIND) {
2532 Py_UCS1 *start = (Py_UCS1 *) data;
2533 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002535 else if (kind == PyUnicode_2BYTE_KIND) {
2536 Py_UCS2 *start = (Py_UCS2 *) data;
2537 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2538 }
2539 else {
2540 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002541 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 if (copy_null)
2544 target[len] = 0;
2545 return target;
2546}
2547
2548Py_UCS4*
2549PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2550 int copy_null)
2551{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002552 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 PyErr_BadInternalCall();
2554 return NULL;
2555 }
2556 return as_ucs4(string, target, targetsize, copy_null);
2557}
2558
2559Py_UCS4*
2560PyUnicode_AsUCS4Copy(PyObject *string)
2561{
2562 return as_ucs4(string, NULL, 0, 1);
2563}
2564
/* Maximum number of characters required for the output of %lld or %p.
   A value of SIZEOF_LONG_LONG bytes needs at most
   ceil(log10(256) * SIZEOF_LONG_LONG) decimal digits, plus 1 for the sign.
   53/22 is an upper bound for log10(256), and 1 + (N*53 - 1) / 22 is the
   integer-arithmetic form of ceil(N * 53 / 22). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002569
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570static int
2571unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2572 Py_ssize_t width, Py_ssize_t precision)
2573{
2574 Py_ssize_t length, fill, arglen;
2575 Py_UCS4 maxchar;
2576
2577 if (PyUnicode_READY(str) == -1)
2578 return -1;
2579
2580 length = PyUnicode_GET_LENGTH(str);
2581 if ((precision == -1 || precision >= length)
2582 && width <= length)
2583 return _PyUnicodeWriter_WriteStr(writer, str);
2584
2585 if (precision != -1)
2586 length = Py_MIN(precision, length);
2587
2588 arglen = Py_MAX(length, width);
2589 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2590 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2591 else
2592 maxchar = writer->maxchar;
2593
2594 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2595 return -1;
2596
2597 if (width > length) {
2598 fill = width - length;
2599 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2600 return -1;
2601 writer->pos += fill;
2602 }
2603
2604 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2605 str, 0, length);
2606 writer->pos += length;
2607 return 0;
2608}
2609
2610static int
Victor Stinner998b8062018-09-12 00:23:25 +02002611unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002612 Py_ssize_t width, Py_ssize_t precision)
2613{
2614 /* UTF-8 */
2615 Py_ssize_t length;
2616 PyObject *unicode;
2617 int res;
2618
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002619 if (precision == -1) {
2620 length = strlen(str);
2621 }
2622 else {
2623 length = 0;
2624 while (length < precision && str[length]) {
2625 length++;
2626 }
2627 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2629 if (unicode == NULL)
2630 return -1;
2631
2632 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2633 Py_DECREF(unicode);
2634 return res;
2635}
2636
Victor Stinner96865452011-03-01 23:44:09 +00002637static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002638unicode_fromformat_arg(_PyUnicodeWriter *writer,
2639 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002640{
Victor Stinnere215d962012-10-06 23:03:36 +02002641 const char *p;
2642 Py_ssize_t len;
2643 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 Py_ssize_t width;
2645 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 int longflag;
2647 int longlongflag;
2648 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002649 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 p = f;
2652 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002653 zeropad = 0;
2654 if (*f == '0') {
2655 zeropad = 1;
2656 f++;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658
2659 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 width = -1;
2661 if (Py_ISDIGIT((unsigned)*f)) {
2662 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002663 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002664 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002665 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002666 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002668 return NULL;
2669 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002671 f++;
2672 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002673 }
2674 precision = -1;
2675 if (*f == '.') {
2676 f++;
2677 if (Py_ISDIGIT((unsigned)*f)) {
2678 precision = (*f - '0');
2679 f++;
2680 while (Py_ISDIGIT((unsigned)*f)) {
2681 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2682 PyErr_SetString(PyExc_ValueError,
2683 "precision too big");
2684 return NULL;
2685 }
2686 precision = (precision * 10) + (*f - '0');
2687 f++;
2688 }
2689 }
Victor Stinner96865452011-03-01 23:44:09 +00002690 if (*f == '%') {
2691 /* "%.3%s" => f points to "3" */
2692 f--;
2693 }
2694 }
2695 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002697 f--;
2698 }
Victor Stinner96865452011-03-01 23:44:09 +00002699
2700 /* Handle %ld, %lu, %lld and %llu. */
2701 longflag = 0;
2702 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002703 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002704 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002705 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002706 longflag = 1;
2707 ++f;
2708 }
Victor Stinner96865452011-03-01 23:44:09 +00002709 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002710 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002711 longlongflag = 1;
2712 f += 2;
2713 }
Victor Stinner96865452011-03-01 23:44:09 +00002714 }
2715 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002716 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002717 size_tflag = 1;
2718 ++f;
2719 }
Victor Stinnere215d962012-10-06 23:03:36 +02002720
2721 if (f[1] == '\0')
2722 writer->overallocate = 0;
2723
2724 switch (*f) {
2725 case 'c':
2726 {
2727 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002728 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002729 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002730 "character argument not in range(0x110000)");
2731 return NULL;
2732 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002733 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'i':
2739 case 'd':
2740 case 'u':
2741 case 'x':
2742 {
2743 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002744 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002746
2747 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002748 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002749 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002750 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002751 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002752 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002753 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002754 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002755 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002756 va_arg(*vargs, size_t));
2757 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002758 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002759 va_arg(*vargs, unsigned int));
2760 }
2761 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002762 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002763 }
2764 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002765 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002766 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002767 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002768 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002769 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002770 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002771 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002772 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002773 va_arg(*vargs, Py_ssize_t));
2774 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002775 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002776 va_arg(*vargs, int));
2777 }
2778 assert(len >= 0);
2779
Victor Stinnere215d962012-10-06 23:03:36 +02002780 if (precision < len)
2781 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002782
2783 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2785 return NULL;
2786
Victor Stinnere215d962012-10-06 23:03:36 +02002787 if (width > precision) {
2788 Py_UCS4 fillchar;
2789 fill = width - precision;
2790 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002791 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2792 return NULL;
2793 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 }
Victor Stinner15a11362012-10-06 23:48:20 +02002795 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2798 return NULL;
2799 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002800 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801
Victor Stinner4a587072013-11-19 12:54:53 +01002802 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2803 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002804 break;
2805 }
2806
2807 case 'p':
2808 {
2809 char number[MAX_LONG_LONG_CHARS];
2810
2811 len = sprintf(number, "%p", va_arg(*vargs, void*));
2812 assert(len >= 0);
2813
2814 /* %p is ill-defined: ensure leading 0x. */
2815 if (number[1] == 'X')
2816 number[1] = 'x';
2817 else if (number[1] != 'x') {
2818 memmove(number + 2, number,
2819 strlen(number) + 1);
2820 number[0] = '0';
2821 number[1] = 'x';
2822 len += 2;
2823 }
2824
Victor Stinner4a587072013-11-19 12:54:53 +01002825 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002826 return NULL;
2827 break;
2828 }
2829
2830 case 's':
2831 {
2832 /* UTF-8 */
2833 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002834 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002835 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002836 break;
2837 }
2838
2839 case 'U':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 assert(obj && _PyUnicode_CHECK(obj));
2843
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002845 return NULL;
2846 break;
2847 }
2848
2849 case 'V':
2850 {
2851 PyObject *obj = va_arg(*vargs, PyObject *);
2852 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002853 if (obj) {
2854 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002855 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 }
2858 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002860 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002862 }
2863 break;
2864 }
2865
2866 case 'S':
2867 {
2868 PyObject *obj = va_arg(*vargs, PyObject *);
2869 PyObject *str;
2870 assert(obj);
2871 str = PyObject_Str(obj);
2872 if (!str)
2873 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002874 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002875 Py_DECREF(str);
2876 return NULL;
2877 }
2878 Py_DECREF(str);
2879 break;
2880 }
2881
2882 case 'R':
2883 {
2884 PyObject *obj = va_arg(*vargs, PyObject *);
2885 PyObject *repr;
2886 assert(obj);
2887 repr = PyObject_Repr(obj);
2888 if (!repr)
2889 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002890 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 Py_DECREF(repr);
2892 return NULL;
2893 }
2894 Py_DECREF(repr);
2895 break;
2896 }
2897
2898 case 'A':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 PyObject *ascii;
2902 assert(obj);
2903 ascii = PyObject_ASCII(obj);
2904 if (!ascii)
2905 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002906 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002907 Py_DECREF(ascii);
2908 return NULL;
2909 }
2910 Py_DECREF(ascii);
2911 break;
2912 }
2913
2914 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002915 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 break;
2918
2919 default:
2920 /* if we stumble upon an unknown formatting code, copy the rest
2921 of the format string to the output string. (we cannot just
2922 skip the code, since there's no way to know what's in the
2923 argument list) */
2924 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002925 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 return NULL;
2927 f = p+len;
2928 return f;
2929 }
2930
2931 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002932 return f;
2933}
2934
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935PyObject *
2936PyUnicode_FromFormatV(const char *format, va_list vargs)
2937{
Victor Stinnere215d962012-10-06 23:03:36 +02002938 va_list vargs2;
2939 const char *f;
2940 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941
Victor Stinner8f674cc2013-04-17 23:02:17 +02002942 _PyUnicodeWriter_Init(&writer);
2943 writer.min_length = strlen(format) + 100;
2944 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002945
Benjamin Peterson0c212142016-09-20 20:39:33 -07002946 // Copy varags to be able to pass a reference to a subfunction.
2947 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002948
2949 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002950 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 f = unicode_fromformat_arg(&writer, f, &vargs2);
2952 if (f == NULL)
2953 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002956 const char *p;
2957 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002958
Victor Stinnere215d962012-10-06 23:03:36 +02002959 p = f;
2960 do
2961 {
2962 if ((unsigned char)*p > 127) {
2963 PyErr_Format(PyExc_ValueError,
2964 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2965 "string, got a non-ASCII byte: 0x%02x",
2966 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002967 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
2969 p++;
2970 }
2971 while (*p != '\0' && *p != '%');
2972 len = p - f;
2973
2974 if (*p == '\0')
2975 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002976
2977 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002978 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002979
2980 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002982 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002983 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002984 return _PyUnicodeWriter_Finish(&writer);
2985
2986 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002987 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002988 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002989 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002990}
2991
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992PyObject *
2993PyUnicode_FromFormat(const char *format, ...)
2994{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002995 PyObject* ret;
2996 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002997
2998#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002999 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 ret = PyUnicode_FromFormatV(format, vargs);
3004 va_end(vargs);
3005 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003006}
3007
Serhiy Storchakac46db922018-10-23 22:58:24 +03003008static Py_ssize_t
3009unicode_get_widechar_size(PyObject *unicode)
3010{
3011 Py_ssize_t res;
3012
3013 assert(unicode != NULL);
3014 assert(_PyUnicode_CHECK(unicode));
3015
3016 if (_PyUnicode_WSTR(unicode) != NULL) {
3017 return PyUnicode_WSTR_LENGTH(unicode);
3018 }
3019 assert(PyUnicode_IS_READY(unicode));
3020
3021 res = _PyUnicode_LENGTH(unicode);
3022#if SIZEOF_WCHAR_T == 2
3023 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3024 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3025 const Py_UCS4 *end = s + res;
3026 for (; s < end; ++s) {
3027 if (*s > 0xFFFF) {
3028 ++res;
3029 }
3030 }
3031 }
3032#endif
3033 return res;
3034}
3035
3036static void
3037unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3038{
3039 const wchar_t *wstr;
3040
3041 assert(unicode != NULL);
3042 assert(_PyUnicode_CHECK(unicode));
3043
3044 wstr = _PyUnicode_WSTR(unicode);
3045 if (wstr != NULL) {
3046 memcpy(w, wstr, size * sizeof(wchar_t));
3047 return;
3048 }
3049 assert(PyUnicode_IS_READY(unicode));
3050
3051 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3052 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3053 for (; size--; ++s, ++w) {
3054 *w = *s;
3055 }
3056 }
3057 else {
3058#if SIZEOF_WCHAR_T == 4
3059 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3060 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3061 for (; size--; ++s, ++w) {
3062 *w = *s;
3063 }
3064#else
3065 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3066 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3067 for (; size--; ++s, ++w) {
3068 Py_UCS4 ch = *s;
3069 if (ch > 0xFFFF) {
3070 assert(ch <= MAX_UNICODE);
3071 /* encode surrogate pair in this case */
3072 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3073 if (!size--)
3074 break;
3075 *w = Py_UNICODE_LOW_SURROGATE(ch);
3076 }
3077 else {
3078 *w = ch;
3079 }
3080 }
3081#endif
3082 }
3083}
3084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003085#ifdef HAVE_WCHAR_H
3086
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003087/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003088
Victor Stinnerd88d9832011-09-06 02:00:05 +02003089 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003090 character) required to convert the unicode object. Ignore size argument.
3091
Victor Stinnerd88d9832011-09-06 02:00:05 +02003092 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003093 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003094 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003095Py_ssize_t
3096PyUnicode_AsWideChar(PyObject *unicode,
3097 wchar_t *w,
3098 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003099{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003100 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003102 if (unicode == NULL) {
3103 PyErr_BadInternalCall();
3104 return -1;
3105 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110
3111 res = unicode_get_widechar_size(unicode);
3112 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003114 }
3115
3116 if (size > res) {
3117 size = res + 1;
3118 }
3119 else {
3120 res = size;
3121 }
3122 unicode_copy_as_widechar(unicode, w, size);
3123 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003124}
3125
Victor Stinner137c34c2010-09-29 10:25:54 +00003126wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003127PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003128 Py_ssize_t *size)
3129{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003130 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003131 Py_ssize_t buflen;
3132
3133 if (unicode == NULL) {
3134 PyErr_BadInternalCall();
3135 return NULL;
3136 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139 return NULL;
3140 }
3141
Serhiy Storchakac46db922018-10-23 22:58:24 +03003142 buflen = unicode_get_widechar_size(unicode);
3143 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003144 if (buffer == NULL) {
3145 PyErr_NoMemory();
3146 return NULL;
3147 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003148 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3149 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003151 }
3152 else if (wcslen(buffer) != (size_t)buflen) {
3153 PyMem_FREE(buffer);
3154 PyErr_SetString(PyExc_ValueError,
3155 "embedded null character");
3156 return NULL;
3157 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003158 return buffer;
3159}
3160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003165{
Victor Stinner8faf8212011-12-08 22:14:11 +01003166 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 PyErr_SetString(PyExc_ValueError,
3168 "chr() arg not in range(0x110000)");
3169 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003170 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003171
Victor Stinner985a82a2014-01-03 12:53:47 +01003172 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003173}
3174
Alexander Belopolsky40018472011-02-26 01:02:56 +00003175PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003176PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003178 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003180 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003181 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003182 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 Py_INCREF(obj);
3184 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003185 }
3186 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 /* For a Unicode subtype that's not a Unicode object,
3188 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003189 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003190 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003191 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003192 "Can't convert '%.100s' object to str implicitly",
3193 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003194 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003195}
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003198PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003199 const char *encoding,
3200 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003201{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 PyErr_BadInternalCall();
3207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003209
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003210 /* Decoding bytes objects is the most common case and should be fast */
3211 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003212 if (PyBytes_GET_SIZE(obj) == 0)
3213 _Py_RETURN_UNICODE_EMPTY();
3214 v = PyUnicode_Decode(
3215 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3216 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003217 return v;
3218 }
3219
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003220 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 PyErr_SetString(PyExc_TypeError,
3222 "decoding str is not supported");
3223 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003224 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003225
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003226 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3227 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3228 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003229 "decoding to str: need a bytes-like object, %.80s found",
3230 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003231 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003232 }
Tim Petersced69f82003-09-16 20:30:58 +00003233
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003234 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003235 PyBuffer_Release(&buffer);
3236 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003238
Serhiy Storchaka05997252013-01-26 12:14:02 +02003239 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003240 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003241 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242}
3243
/* Normalize an encoding name into `lower` (a buffer of `lower_len` bytes):
   similar to encodings.normalize_encoding(), but the result is also
   lowercased.

   Alphanumeric characters and '.' are kept (lowercased); every run of other
   characters between kept characters collapses into a single '_', and such
   runs at the start or end of the name are dropped.

   Return 1 on success, or 0 if the normalized name does not fit in
   lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last slot of the buffer, reserved for the terminating null. */
    char *const out_last = &lower[lower_len - 1];
    int pending_sep = 0;
    const char *in;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Punctuation: remember it, emit at most one '_' later. */
            pending_sep = 1;
            continue;
        }

        if (pending_sep && out != lower) {
            if (out == out_last) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_last) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3292
Alexander Belopolsky40018472011-02-26 01:02:56 +00003293PyObject *
3294PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003295 Py_ssize_t size,
3296 const char *encoding,
3297 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003298{
3299 PyObject *buffer = NULL, *unicode;
3300 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003301 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3302
3303 if (encoding == NULL) {
3304 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3305 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003306
Fred Drakee4315f52000-05-09 19:53:39 +00003307 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003308 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3309 char *lower = buflower;
3310
3311 /* Fast paths */
3312 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3313 lower += 3;
3314 if (*lower == '_') {
3315 /* Match "utf8" and "utf_8" */
3316 lower++;
3317 }
3318
3319 if (lower[0] == '8' && lower[1] == 0) {
3320 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3321 }
3322 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3323 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3324 }
3325 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3326 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3327 }
3328 }
3329 else {
3330 if (strcmp(lower, "ascii") == 0
3331 || strcmp(lower, "us_ascii") == 0) {
3332 return PyUnicode_DecodeASCII(s, size, errors);
3333 }
Steve Dowercc16be82016-09-08 10:35:16 -07003334 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003335 else if (strcmp(lower, "mbcs") == 0) {
3336 return PyUnicode_DecodeMBCS(s, size, errors);
3337 }
3338 #endif
3339 else if (strcmp(lower, "latin1") == 0
3340 || strcmp(lower, "latin_1") == 0
3341 || strcmp(lower, "iso_8859_1") == 0
3342 || strcmp(lower, "iso8859_1") == 0) {
3343 return PyUnicode_DecodeLatin1(s, size, errors);
3344 }
3345 }
Victor Stinner37296e82010-06-10 13:36:23 +00003346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
3348 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003349 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003350 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003351 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003352 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (buffer == NULL)
3354 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003355 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 if (unicode == NULL)
3357 goto onError;
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003360 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003361 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003362 encoding,
3363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 Py_DECREF(unicode);
3365 goto onError;
3366 }
3367 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003368 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 Py_XDECREF(buffer);
3372 return NULL;
3373}
3374
Alexander Belopolsky40018472011-02-26 01:02:56 +00003375PyObject *
3376PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003377 const char *encoding,
3378 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003379{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003382 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003383 }
3384
Serhiy Storchaka00939072016-10-27 21:05:49 +03003385 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3386 "PyUnicode_AsDecodedObject() is deprecated; "
3387 "use PyCodec_Decode() to decode from str", 1) < 0)
3388 return NULL;
3389
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003390 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392
3393 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003394 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
Serhiy Storchaka00939072016-10-27 21:05:49 +03003409 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3410 "PyUnicode_AsDecodedUnicode() is deprecated; "
3411 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3412 return NULL;
3413
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003414 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416
3417 /* Decode via the codec registry */
3418 v = PyCodec_Decode(unicode, encoding, errors);
3419 if (v == NULL)
3420 goto onError;
3421 if (!PyUnicode_Check(v)) {
3422 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003423 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003424 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003425 encoding,
3426 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 Py_DECREF(v);
3428 goto onError;
3429 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003430 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433 return NULL;
3434}
3435
Alexander Belopolsky40018472011-02-26 01:02:56 +00003436PyObject *
3437PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003438 Py_ssize_t size,
3439 const char *encoding,
3440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441{
3442 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003444 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3448 Py_DECREF(unicode);
3449 return v;
3450}
3451
Alexander Belopolsky40018472011-02-26 01:02:56 +00003452PyObject *
3453PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003454 const char *encoding,
3455 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456{
3457 PyObject *v;
3458
3459 if (!PyUnicode_Check(unicode)) {
3460 PyErr_BadArgument();
3461 goto onError;
3462 }
3463
Serhiy Storchaka00939072016-10-27 21:05:49 +03003464 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3465 "PyUnicode_AsEncodedObject() is deprecated; "
3466 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3467 "or PyCodec_Encode() for generic encoding", 1) < 0)
3468 return NULL;
3469
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003470 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003472
3473 /* Encode via the codec registry */
3474 v = PyCodec_Encode(unicode, encoding, errors);
3475 if (v == NULL)
3476 goto onError;
3477 return v;
3478
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003480 return NULL;
3481}
3482
Victor Stinner1b579672011-12-17 05:47:23 +01003483
Victor Stinner2cba6b82018-01-10 22:46:15 +01003484static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003485unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003486 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003487{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 Py_ssize_t wlen;
3489 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3490 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003492 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003494 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003495 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003496 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003497 return NULL;
3498 }
3499
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003500 char *str;
3501 size_t error_pos;
3502 const char *reason;
3503 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003504 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003505 PyMem_Free(wstr);
3506
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003507 if (res != 0) {
3508 if (res == -2) {
3509 PyObject *exc;
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 if (exc != NULL) {
3516 PyCodec_StrictErrors(exc);
3517 Py_DECREF(exc);
3518 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003519 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003520 else if (res == -3) {
3521 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3522 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003524 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003526 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003527 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003528
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003529 PyObject *bytes = PyBytes_FromString(str);
3530 PyMem_RawFree(str);
3531 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003532}
3533
Victor Stinnerad158722010-10-27 00:25:46 +00003534PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003535PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3536{
Victor Stinner709d23d2019-05-02 14:56:30 -04003537 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3538 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003539}
3540
3541PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003542PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003543{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003544 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003545#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003546 if (interp->fs_codec.encoding) {
3547 return unicode_encode_utf8(unicode,
3548 interp->fs_codec.error_handler,
3549 interp->fs_codec.errors);
3550 }
3551 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003552 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003553 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003554 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003555 assert(errors != _Py_ERROR_UNKNOWN);
3556 return unicode_encode_utf8(unicode, errors, NULL);
3557 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003558#else
Victor Stinner793b5312011-04-27 00:24:21 +02003559 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3560 cannot use it to encode and decode filenames before it is loaded. Load
3561 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003562 implementation of the locale codec until the codec registry is
3563 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003564 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003565 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003566 interp->fs_codec.encoding,
3567 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003568 }
3569 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003570 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003571 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003572 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003573 assert(errors != _Py_ERROR_UNKNOWN);
3574 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003575 }
Victor Stinnerad158722010-10-27 00:25:46 +00003576#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003577}
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding,
3582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583{
3584 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003585 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (!PyUnicode_Check(unicode)) {
3588 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Fred Drakee4315f52000-05-09 19:53:39 +00003591
Victor Stinner942889a2016-09-05 15:40:10 -07003592 if (encoding == NULL) {
3593 return _PyUnicode_AsUTF8String(unicode, errors);
3594 }
3595
Fred Drakee4315f52000-05-09 19:53:39 +00003596 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003597 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3598 char *lower = buflower;
3599
3600 /* Fast paths */
3601 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3602 lower += 3;
3603 if (*lower == '_') {
3604 /* Match "utf8" and "utf_8" */
3605 lower++;
3606 }
3607
3608 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003610 }
3611 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3612 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3613 }
3614 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3615 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3616 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003617 }
Victor Stinner942889a2016-09-05 15:40:10 -07003618 else {
3619 if (strcmp(lower, "ascii") == 0
3620 || strcmp(lower, "us_ascii") == 0) {
3621 return _PyUnicode_AsASCIIString(unicode, errors);
3622 }
Steve Dowercc16be82016-09-08 10:35:16 -07003623#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003624 else if (strcmp(lower, "mbcs") == 0) {
3625 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3626 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003627#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003628 else if (strcmp(lower, "latin1") == 0 ||
3629 strcmp(lower, "latin_1") == 0 ||
3630 strcmp(lower, "iso_8859_1") == 0 ||
3631 strcmp(lower, "iso8859_1") == 0) {
3632 return _PyUnicode_AsLatin1String(unicode, errors);
3633 }
3634 }
Victor Stinner37296e82010-06-10 13:36:23 +00003635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003638 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 return NULL;
3641
3642 /* The normal path */
3643 if (PyBytes_Check(v))
3644 return v;
3645
3646 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003648 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003649 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650
3651 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "encoder %s returned bytearray instead of bytes; "
3653 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003654 encoding);
3655 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 Py_DECREF(v);
3657 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003660 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3661 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003662 Py_DECREF(v);
3663 return b;
3664 }
3665
3666 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003667 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003669 encoding,
3670 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003671 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 return NULL;
3673}
3674
Alexander Belopolsky40018472011-02-26 01:02:56 +00003675PyObject *
3676PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003677 const char *encoding,
3678 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003679{
3680 PyObject *v;
3681
3682 if (!PyUnicode_Check(unicode)) {
3683 PyErr_BadArgument();
3684 goto onError;
3685 }
3686
Serhiy Storchaka00939072016-10-27 21:05:49 +03003687 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3688 "PyUnicode_AsEncodedUnicode() is deprecated; "
3689 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3690 return NULL;
3691
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003692 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003694
3695 /* Encode via the codec registry */
3696 v = PyCodec_Encode(unicode, encoding, errors);
3697 if (v == NULL)
3698 goto onError;
3699 if (!PyUnicode_Check(v)) {
3700 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003701 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003702 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003703 encoding,
3704 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003705 Py_DECREF(v);
3706 goto onError;
3707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712}
3713
Victor Stinner2cba6b82018-01-10 22:46:15 +01003714static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003715unicode_decode_locale(const char *str, Py_ssize_t len,
3716 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003718 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3719 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720 return NULL;
3721 }
3722
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003723 wchar_t *wstr;
3724 size_t wlen;
3725 const char *reason;
3726 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003727 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003728 if (res != 0) {
3729 if (res == -2) {
3730 PyObject *exc;
3731 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3732 "locale", str, len,
3733 (Py_ssize_t)wlen,
3734 (Py_ssize_t)(wlen + 1),
3735 reason);
3736 if (exc != NULL) {
3737 PyCodec_StrictErrors(exc);
3738 Py_DECREF(exc);
3739 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003740 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003741 else if (res == -3) {
3742 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3743 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003744 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003745 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 }
Victor Stinner2f197072011-12-17 07:08:30 +01003747 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003748 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749
3750 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3751 PyMem_RawFree(wstr);
3752 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003753}
3754
3755PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003756PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3757 const char *errors)
3758{
Victor Stinner709d23d2019-05-02 14:56:30 -04003759 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3760 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003761}
3762
3763PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003764PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765{
3766 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003767 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3768 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003769}
3770
3771
3772PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003773PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003774 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003775 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3776}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003777
Christian Heimes5894ba72007-11-04 11:43:14 +00003778PyObject*
3779PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3780{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003781 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003782#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003783 if (interp->fs_codec.encoding) {
3784 return unicode_decode_utf8(s, size,
3785 interp->fs_codec.error_handler,
3786 interp->fs_codec.errors,
3787 NULL);
3788 }
3789 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003790 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003791 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003792 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 assert(errors != _Py_ERROR_UNKNOWN);
3794 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3795 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003796#else
Victor Stinner793b5312011-04-27 00:24:21 +02003797 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3798 cannot use it to encode and decode filenames before it is loaded. Load
3799 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003800 implementation of the locale codec until the codec registry is
3801 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003802 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003803 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 interp->fs_codec.encoding,
3805 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003806 }
3807 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003808 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003809 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003810 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003811 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812 }
Victor Stinnerad158722010-10-27 00:25:46 +00003813#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814}
3815
Martin v. Löwis011e8422009-05-05 04:43:17 +00003816
3817int
3818PyUnicode_FSConverter(PyObject* arg, void* addr)
3819{
Brett Cannonec6ce872016-09-06 15:50:29 -07003820 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003821 PyObject *output = NULL;
3822 Py_ssize_t size;
3823 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003824 if (arg == NULL) {
3825 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003826 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003827 return 1;
3828 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003829 path = PyOS_FSPath(arg);
3830 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003831 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003832 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003833 if (PyBytes_Check(path)) {
3834 output = path;
3835 }
3836 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3837 output = PyUnicode_EncodeFSDefault(path);
3838 Py_DECREF(path);
3839 if (!output) {
3840 return 0;
3841 }
3842 assert(PyBytes_Check(output));
3843 }
3844
Victor Stinner0ea2a462010-04-30 00:22:08 +00003845 size = PyBytes_GET_SIZE(output);
3846 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003847 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003848 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003849 Py_DECREF(output);
3850 return 0;
3851 }
3852 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003853 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854}
3855
3856
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857int
3858PyUnicode_FSDecoder(PyObject* arg, void* addr)
3859{
Brett Cannona5711202016-09-06 19:36:01 -07003860 int is_buffer = 0;
3861 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003862 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003863 if (arg == NULL) {
3864 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003865 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003866 return 1;
3867 }
Brett Cannona5711202016-09-06 19:36:01 -07003868
3869 is_buffer = PyObject_CheckBuffer(arg);
3870 if (!is_buffer) {
3871 path = PyOS_FSPath(arg);
3872 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003873 return 0;
3874 }
Brett Cannona5711202016-09-06 19:36:01 -07003875 }
3876 else {
3877 path = arg;
3878 Py_INCREF(arg);
3879 }
3880
3881 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003882 output = path;
3883 }
3884 else if (PyBytes_Check(path) || is_buffer) {
3885 PyObject *path_bytes = NULL;
3886
3887 if (!PyBytes_Check(path) &&
3888 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003889 "path should be string, bytes, or os.PathLike, not %.200s",
3890 Py_TYPE(arg)->tp_name)) {
3891 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003892 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003893 }
3894 path_bytes = PyBytes_FromObject(path);
3895 Py_DECREF(path);
3896 if (!path_bytes) {
3897 return 0;
3898 }
3899 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3900 PyBytes_GET_SIZE(path_bytes));
3901 Py_DECREF(path_bytes);
3902 if (!output) {
3903 return 0;
3904 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003905 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003906 else {
3907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003908 "path should be string, bytes, or os.PathLike, not %.200s",
3909 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003910 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003911 return 0;
3912 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003913 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003914 Py_DECREF(output);
3915 return 0;
3916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003918 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920 Py_DECREF(output);
3921 return 0;
3922 }
3923 *(PyObject**)addr = output;
3924 return Py_CLEANUP_SUPPORTED;
3925}
3926
3927
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003928const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003930{
Christian Heimesf3863112007-11-22 07:46:41 +00003931 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003933 if (!PyUnicode_Check(unicode)) {
3934 PyErr_BadArgument();
3935 return NULL;
3936 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003937 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003940 if (PyUnicode_UTF8(unicode) == NULL) {
3941 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003942 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 if (bytes == NULL)
3944 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3946 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003947 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 Py_DECREF(bytes);
3949 return NULL;
3950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003952 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 PyBytes_AS_STRING(bytes),
3954 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 Py_DECREF(bytes);
3956 }
3957
3958 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003959 *psize = PyUnicode_UTF8_LENGTH(unicode);
3960 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003961}
3962
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003963const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3967}
3968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969Py_UNICODE *
3970PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 if (!PyUnicode_Check(unicode)) {
3973 PyErr_BadArgument();
3974 return NULL;
3975 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003976 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3977 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003979 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981
Serhiy Storchakac46db922018-10-23 22:58:24 +03003982 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3983 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3984 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003987 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3988 if (w == NULL) {
3989 PyErr_NoMemory();
3990 return NULL;
3991 }
3992 unicode_copy_as_widechar(unicode, w, wlen + 1);
3993 _PyUnicode_WSTR(unicode) = w;
3994 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3995 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 }
3997 }
3998 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004000 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001}
4002
Alexander Belopolsky40018472011-02-26 01:02:56 +00004003Py_UNICODE *
4004PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007}
4008
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004009const Py_UNICODE *
4010_PyUnicode_AsUnicode(PyObject *unicode)
4011{
4012 Py_ssize_t size;
4013 const Py_UNICODE *wstr;
4014
4015 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4016 if (wstr && wcslen(wstr) != (size_t)size) {
4017 PyErr_SetString(PyExc_ValueError, "embedded null character");
4018 return NULL;
4019 }
4020 return wstr;
4021}
4022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023
Alexander Belopolsky40018472011-02-26 01:02:56 +00004024Py_ssize_t
4025PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026{
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 goto onError;
4030 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
4032 if (PyUnicode_AsUnicode(unicode) == NULL)
4033 goto onError;
4034 }
4035 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return -1;
4039}
4040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041Py_ssize_t
4042PyUnicode_GetLength(PyObject *unicode)
4043{
Victor Stinner07621332012-06-16 04:53:46 +02004044 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 PyErr_BadArgument();
4046 return -1;
4047 }
Victor Stinner07621332012-06-16 04:53:46 +02004048 if (PyUnicode_READY(unicode) == -1)
4049 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 return PyUnicode_GET_LENGTH(unicode);
4051}
4052
4053Py_UCS4
4054PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4055{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004056 void *data;
4057 int kind;
4058
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004059 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004060 PyErr_BadArgument();
4061 return (Py_UCS4)-1;
4062 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004063 if (PyUnicode_READY(unicode) == -1) {
4064 return (Py_UCS4)-1;
4065 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004066 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004067 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 return (Py_UCS4)-1;
4069 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004070 data = PyUnicode_DATA(unicode);
4071 kind = PyUnicode_KIND(unicode);
4072 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073}
4074
4075int
4076PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4077{
4078 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004079 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 return -1;
4081 }
Victor Stinner488fa492011-12-12 00:01:39 +01004082 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004083 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004084 PyErr_SetString(PyExc_IndexError, "string index out of range");
4085 return -1;
4086 }
Victor Stinner488fa492011-12-12 00:01:39 +01004087 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004088 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004089 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4090 PyErr_SetString(PyExc_ValueError, "character out of range");
4091 return -1;
4092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4094 index, ch);
4095 return 0;
4096}
4097
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4103
Victor Stinner554f3f02010-06-16 23:33:54 +00004104/* create or adjust a UnicodeDecodeError */
4105static void
4106make_decode_exception(PyObject **exceptionObject,
4107 const char *encoding,
4108 const char *input, Py_ssize_t length,
4109 Py_ssize_t startpos, Py_ssize_t endpos,
4110 const char *reason)
4111{
4112 if (*exceptionObject == NULL) {
4113 *exceptionObject = PyUnicodeDecodeError_Create(
4114 encoding, input, length, startpos, endpos, reason);
4115 }
4116 else {
4117 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4118 goto onError;
4119 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4120 goto onError;
4121 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4122 goto onError;
4123 }
4124 return;
4125
4126onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004127 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004128}
4129
#ifdef MS_WINDOWS
/* Make sure the wchar_t buffer *buf can hold newsize elements.

   The buffer is only reallocated when newsize is larger than *size; in every
   successful case *size is set to newsize.  The resize goes through a
   temporary pointer so that *buf still holds the old buffer when the
   reallocation fails.

   Returns 0 on success, -1 with MemoryError set on failure. */
static int
widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
{
    if (newsize > *size) {
        wchar_t *newbuf = *buf;
        if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        *buf = newbuf;
    }
    *size = newsize;
    return 0;
}

/* error handling callback helper:
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler: name of the error handler and a cache slot for the
       looked-up callable (filled in on first use).
   encoding, reason: stored in the UnicodeDecodeError passed to the handler.
   input, inend: in/out bounds of the input; reloaded from the exception's
       object after the call because the handler may have replaced it.
   startinpos, endinpos: error range; *endinpos is set to the position
       returned by the handler.
   exceptionObject: cache slot for the exception (created or updated here).
   inptr: set to the input position where decoding should resume.
   buf, bufsize, outpos: wchar_t output buffer, its capacity, and the write
       position; the buffer is grown as needed and *outpos is advanced past
       the replacement.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
{
    /* "Un" is the PyArg_ParseTuple format; the text after ';' is the error
       message, which is also reused directly via &argparse[3]. */
    static const char *argparse = "Un;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[3]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is taken relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    outsize = *bufsize;
    if (requiredsize > outsize) {
        /* Grow to at least double the current size when that cannot
           overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (widechar_resize(buf, bufsize, requiredsize) < 0) {
            goto onError;
        }
    }
    wcsncpy(*buf + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_DECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004256
4257static int
4258unicode_decode_call_errorhandler_writer(
4259 const char *errors, PyObject **errorHandler,
4260 const char *encoding, const char *reason,
4261 const char **input, const char **inend, Py_ssize_t *startinpos,
4262 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4263 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4264{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004265 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266
4267 PyObject *restuple = NULL;
4268 PyObject *repunicode = NULL;
4269 Py_ssize_t insize;
4270 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004271 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004272 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004274 int need_to_grow = 0;
4275 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004276
4277 if (*errorHandler == NULL) {
4278 *errorHandler = PyCodec_LookupError(errors);
4279 if (*errorHandler == NULL)
4280 goto onError;
4281 }
4282
4283 make_decode_exception(exceptionObject,
4284 encoding,
4285 *input, *inend - *input,
4286 *startinpos, *endinpos,
4287 reason);
4288 if (*exceptionObject == NULL)
4289 goto onError;
4290
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004291 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 if (restuple == NULL)
4293 goto onError;
4294 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 goto onError;
4297 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004298 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004300
4301 /* Copy back the bytes variables, which might have been modified by the
4302 callback */
4303 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4304 if (!inputobj)
4305 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004306 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004307 *input = PyBytes_AS_STRING(inputobj);
4308 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004309 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004310 /* we can DECREF safely, as the exception has another reference,
4311 so the object won't go away. */
4312 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004317 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320
Victor Stinner170ca6f2013-04-18 00:25:28 +02004321 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004322 if (replen > 1) {
4323 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004324 need_to_grow = 1;
4325 }
4326 new_inptr = *input + newpos;
4327 if (*inend - new_inptr > remain) {
4328 /* We don't know the decoding algorithm here so we make the worst
4329 assumption that one byte decodes to one unicode character.
4330 If unfortunately one byte could decode to more unicode characters,
4331 the decoder may write out-of-bound then. Is it possible for the
4332 algorithms using this function? */
4333 writer->min_length += *inend - new_inptr - remain;
4334 need_to_grow = 1;
4335 }
4336 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004337 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004338 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004339 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4340 goto onError;
4341 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004343 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004346 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004349 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355}
4356
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?  (c is evaluated several times, so it must not
   have side effects.) */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   Any c that is not a letter, digit or '+' yields 63, so callers must
   check IS_BASE64(c) first. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4391
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code and is never written to, hence
 * const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first, so utf7_category is only indexed with
 * values 1..127.  c is evaluated several times and must not have side
 * effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437
Alexander Belopolsky40018472011-02-26 01:02:56 +00004438PyObject *
4439PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004440 Py_ssize_t size,
4441 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4444}
4445
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446/* The decoder. The only state we preserve is our read position,
4447 * i.e. how many characters we have consumed. So if we end in the
4448 * middle of a shift sequence we have to back off the read position
4449 * and the output to the beginning of the sequence, otherwise we lose
4450 * all the shift state (seen bits, number of bits seen, high
4451 * surrogate). */
4452
Alexander Belopolsky40018472011-02-26 01:02:56 +00004453PyObject *
4454PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004455 Py_ssize_t size,
4456 const char *errors,
4457 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004460 Py_ssize_t startinpos;
4461 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 const char *errmsg = "";
4465 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 unsigned int base64bits = 0;
4468 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004469 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 PyObject *errorHandler = NULL;
4471 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (size == 0) {
4474 if (consumed)
4475 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004476 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004477 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004479 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004480 _PyUnicodeWriter_Init(&writer);
4481 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004482
4483 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 e = s + size;
4485
4486 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004487 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004489 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (inShift) { /* in a base-64 section */
4492 if (IS_BASE64(ch)) { /* consume a base-64 character */
4493 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4494 base64bits += 6;
4495 s++;
4496 if (base64bits >= 16) {
4497 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004498 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 base64bits -= 16;
4500 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004501 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 if (surrogate) {
4503 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004504 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4505 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004506 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004507 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004509 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 }
4511 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004512 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004513 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 }
4516 }
Victor Stinner551ac952011-11-29 22:58:13 +01004517 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 /* first surrogate */
4519 surrogate = outCh;
4520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004522 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004523 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 }
4525 }
4526 }
4527 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (base64bits > 0) { /* left-over bits */
4530 if (base64bits >= 6) {
4531 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004532 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 errmsg = "partial character in shift sequence";
4534 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 else {
4537 /* Some bits remain; they should be zero */
4538 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004539 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 errmsg = "non-zero padding bits in shift sequence";
4541 goto utf7Error;
4542 }
4543 }
4544 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004545 if (surrogate && DECODE_DIRECT(ch)) {
4546 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4547 goto onError;
4548 }
4549 surrogate = 0;
4550 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 /* '-' is absorbed; other terminating
4552 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004553 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
4556 }
4557 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 s++; /* consume '+' */
4560 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004562 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004563 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004565 else if (s < e && !IS_BASE64(*s)) {
4566 s++;
4567 errmsg = "ill-formed sequence";
4568 goto utf7Error;
4569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004572 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004575 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 }
4577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004580 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 else {
4584 startinpos = s-starts;
4585 s++;
4586 errmsg = "unexpected special character";
4587 goto utf7Error;
4588 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004592 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 errors, &errorHandler,
4594 "utf7", errmsg,
4595 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 }
4599
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* end of string */
4601
4602 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4603 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004604 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (surrogate ||
4606 (base64bits >= 6) ||
4607 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004609 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 errors, &errorHandler,
4611 "utf7", "unterminated shift sequence",
4612 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004613 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 goto onError;
4615 if (s < e)
4616 goto restart;
4617 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619
4620 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004621 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004623 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004624 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004625 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004626 writer.kind, writer.data, shiftOutStart);
4627 Py_XDECREF(errorHandler);
4628 Py_XDECREF(exc);
4629 _PyUnicodeWriter_Dealloc(&writer);
4630 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004631 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004632 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 }
4634 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004635 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 Py_XDECREF(errorHandler);
4645 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 return NULL;
4648}
4649
4650
Alexander Belopolsky40018472011-02-26 01:02:56 +00004651PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652_PyUnicode_EncodeUTF7(PyObject *str,
4653 int base64SetO,
4654 int base64WhiteSpace,
4655 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004657 int kind;
4658 void *data;
4659 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004662 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 unsigned int base64bits = 0;
4664 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 char * out;
4666 char * start;
4667
Benjamin Petersonbac79492012-01-14 13:34:47 -05004668 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004669 return NULL;
4670 kind = PyUnicode_KIND(str);
4671 data = PyUnicode_DATA(str);
4672 len = PyUnicode_GET_LENGTH(str);
4673
4674 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004677 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004678 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004679 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004680 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681 if (v == NULL)
4682 return NULL;
4683
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004684 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004685 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 if (inShift) {
4689 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4690 /* shifting out */
4691 if (base64bits) { /* output remaining bits */
4692 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4693 base64buffer = 0;
4694 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
4696 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 /* Characters not in the BASE64 set implicitly unshift the sequence
4698 so no '-' is required, except if the character is itself a '-' */
4699 if (IS_BASE64(ch) || ch == '-') {
4700 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 *out++ = (char) ch;
4703 }
4704 else {
4705 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 else { /* not in a shift sequence */
4709 if (ch == '+') {
4710 *out++ = '+';
4711 *out++ = '-';
4712 }
4713 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4714 *out++ = (char) ch;
4715 }
4716 else {
4717 *out++ = '+';
4718 inShift = 1;
4719 goto encode_char;
4720 }
4721 }
4722 continue;
4723encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004725 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004726
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 /* code first surrogate */
4728 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004729 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 while (base64bits >= 6) {
4731 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4732 base64bits -= 6;
4733 }
4734 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004735 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 base64bits += 16;
4738 base64buffer = (base64buffer << 16) | ch;
4739 while (base64bits >= 6) {
4740 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4741 base64bits -= 6;
4742 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004743 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744 if (base64bits)
4745 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4746 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 if (_PyBytes_Resize(&v, out - start) < 0)
4749 return NULL;
4750 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752PyObject *
4753PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4754 Py_ssize_t size,
4755 int base64SetO,
4756 int base64WhiteSpace,
4757 const char *errors)
4758{
4759 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004760 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761 if (tmp == NULL)
4762 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004763 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004764 base64WhiteSpace, errors);
4765 Py_DECREF(tmp);
4766 return result;
4767}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004768
/* The UTF-7 helper macros are not needed past this point. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775/* --- UTF-8 Codec -------------------------------------------------------- */
4776
Alexander Belopolsky40018472011-02-26 01:02:56 +00004777PyObject *
4778PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004779 Py_ssize_t size,
4780 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
Walter Dörwald69652032004-09-07 20:24:22 +00004782 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4783}
4784
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785#include "stringlib/asciilib.h"
4786#include "stringlib/codecs.h"
4787#include "stringlib/undef.h"
4788
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004789#include "stringlib/ucs1lib.h"
4790#include "stringlib/codecs.h"
4791#include "stringlib/undef.h"
4792
4793#include "stringlib/ucs2lib.h"
4794#include "stringlib/codecs.h"
4795#include "stringlib/undef.h"
4796
4797#include "stringlib/ucs4lib.h"
4798#include "stringlib/codecs.h"
4799#include "stringlib/undef.h"
4800
Antoine Pitrouab868312009-01-10 15:40:25 +00004801/* Mask to quickly check whether a C 'long' contains a
4802 non-ASCII, UTF8-encoded char. */
4803#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004804# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004805#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004806# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004807#else
4808# error C 'long' size should be either 4 or 8!
4809#endif
4810
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811static Py_ssize_t
4812ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004817 /*
4818 * Issue #17237: m68k is a bit different from most architectures in
4819 * that objects do not use "natural alignment" - for example, int and
4820 * long are only aligned at 2-byte boundaries. Therefore the assert()
4821 * won't work; also, tests have shown that skipping the "optimised
4822 * version" will even speed up m68k.
4823 */
4824#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004826 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4827 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 /* Fast path, see in STRINGLIB(utf8_decode) for
4829 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004830 /* Help allocation */
4831 const char *_p = p;
4832 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 while (_p < aligned_end) {
4834 unsigned long value = *(const unsigned long *) _p;
4835 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004837 *((unsigned long *)q) = value;
4838 _p += SIZEOF_LONG;
4839 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 p = _p;
4842 while (p < end) {
4843 if ((unsigned char)*p & 0x80)
4844 break;
4845 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004850#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851 while (p < end) {
4852 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4853 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004854 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004855 /* Help allocation */
4856 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 while (_p < aligned_end) {
4858 unsigned long value = *(unsigned long *) _p;
4859 if (value & ASCII_CHAR_MASK)
4860 break;
4861 _p += SIZEOF_LONG;
4862 }
4863 p = _p;
4864 if (_p == end)
4865 break;
4866 }
4867 if ((unsigned char)*p & 0x80)
4868 break;
4869 ++p;
4870 }
4871 memcpy(dest, start, p - start);
4872 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873}
Antoine Pitrouab868312009-01-10 15:40:25 +00004874
Victor Stinner709d23d2019-05-02 14:56:30 -04004875static PyObject *
4876unicode_decode_utf8(const char *s, Py_ssize_t size,
4877 _Py_error_handler error_handler, const char *errors,
4878 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004879{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004880 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004881 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883
4884 Py_ssize_t startinpos;
4885 Py_ssize_t endinpos;
4886 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004887 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004889
4890 if (size == 0) {
4891 if (consumed)
4892 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004893 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004894 }
4895
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4897 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004898 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 *consumed = 1;
4900 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004901 }
4902
Victor Stinner8f674cc2013-04-17 23:02:17 +02004903 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004904 writer.min_length = size;
4905 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004906 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004907
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004908 writer.pos = ascii_decode(s, end, writer.data);
4909 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (s < end) {
4911 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004913
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 if (PyUnicode_IS_ASCII(writer.buffer))
4916 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004918 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 } else {
4922 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 }
4925
4926 switch (ch) {
4927 case 0:
4928 if (s == end || consumed)
4929 goto End;
4930 errmsg = "unexpected end of data";
4931 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004932 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 break;
4934 case 1:
4935 errmsg = "invalid start byte";
4936 startinpos = s - starts;
4937 endinpos = startinpos + 1;
4938 break;
4939 case 2:
Miss Islington (bot)d32594a2019-06-25 02:12:16 -07004940 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4941 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4942 {
4943 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004944 goto End;
4945 }
Miss Islington (bot)d32594a2019-06-25 02:12:16 -07004946 /* fall through */
4947 case 3:
4948 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 errmsg = "invalid continuation byte";
4950 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004951 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 break;
4953 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004954 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 goto onError;
4956 continue;
4957 }
4958
Victor Stinner1d65d912015-10-05 13:43:50 +02004959 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004960 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004961
4962 switch (error_handler) {
4963 case _Py_ERROR_IGNORE:
4964 s += (endinpos - startinpos);
4965 break;
4966
4967 case _Py_ERROR_REPLACE:
4968 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4969 goto onError;
4970 s += (endinpos - startinpos);
4971 break;
4972
4973 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004974 {
4975 Py_ssize_t i;
4976
Victor Stinner1d65d912015-10-05 13:43:50 +02004977 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4978 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004979 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004980 ch = (Py_UCS4)(unsigned char)(starts[i]);
4981 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4982 ch + 0xdc00);
4983 writer.pos++;
4984 }
4985 s += (endinpos - startinpos);
4986 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004987 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004988
4989 default:
4990 if (unicode_decode_call_errorhandler_writer(
4991 errors, &error_handler_obj,
4992 "utf-8", errmsg,
4993 &starts, &end, &startinpos, &endinpos, &exc, &s,
4994 &writer))
4995 goto onError;
4996 }
Victor Stinner785938e2011-12-11 20:09:03 +01004997 }
4998
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 if (consumed)
5001 *consumed = s - starts;
5002
Victor Stinner1d65d912015-10-05 13:43:50 +02005003 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006
5007onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005008 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005010 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005012}
5013
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005014
Victor Stinner709d23d2019-05-02 14:56:30 -04005015PyObject *
5016PyUnicode_DecodeUTF8Stateful(const char *s,
5017 Py_ssize_t size,
5018 const char *errors,
5019 Py_ssize_t *consumed)
5020{
5021 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5022}
5023
5024
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005025/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5026 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005027
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 On success, write a pointer to a newly allocated wide character string into
5029 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5030 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On memory allocation failure, return -1.
5033
5034 On decoding error (if surrogateescape is zero), return -2. If wlen is
5035 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5036 is not NULL, write the decoding error message into *reason. */
5037int
5038_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005039 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005040{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005041 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005042 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 wchar_t *unicode;
5044 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005045
Victor Stinner3d4226a2018-08-29 22:21:32 +02005046 int surrogateescape = 0;
5047 int surrogatepass = 0;
5048 switch (errors)
5049 {
5050 case _Py_ERROR_STRICT:
5051 break;
5052 case _Py_ERROR_SURROGATEESCAPE:
5053 surrogateescape = 1;
5054 break;
5055 case _Py_ERROR_SURROGATEPASS:
5056 surrogatepass = 1;
5057 break;
5058 default:
5059 return -3;
5060 }
5061
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005062 /* Note: size will always be longer than the resulting Unicode
5063 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005064 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005065 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005066 }
5067
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005068 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005069 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005070 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005071 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005072
5073 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005082#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 if (ch > 0xFF) {
5084#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005085 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005087 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5090 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5091#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005092 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005094 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005096 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005097
5098 if (surrogateescape) {
5099 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5100 }
5101 else {
5102 /* Is it a valid three-byte code? */
5103 if (surrogatepass
5104 && (e - s) >= 3
5105 && (s[0] & 0xf0) == 0xe0
5106 && (s[1] & 0xc0) == 0x80
5107 && (s[2] & 0xc0) == 0x80)
5108 {
5109 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5110 s += 3;
5111 unicode[outpos++] = ch;
5112 }
5113 else {
5114 PyMem_RawFree(unicode );
5115 if (reason != NULL) {
5116 switch (ch) {
5117 case 0:
5118 *reason = "unexpected end of data";
5119 break;
5120 case 1:
5121 *reason = "invalid start byte";
5122 break;
5123 /* 2, 3, 4 */
5124 default:
5125 *reason = "invalid continuation byte";
5126 break;
5127 }
5128 }
5129 if (wlen != NULL) {
5130 *wlen = s - orig_s;
5131 }
5132 return -2;
5133 }
5134 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005138 if (wlen) {
5139 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005140 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005141 *wstr = unicode;
5142 return 0;
5143}
5144
Victor Stinner5f9cf232019-03-19 01:46:25 +01005145
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005147_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5148 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005149{
5150 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005151 int res = _Py_DecodeUTF8Ex(arg, arglen,
5152 &wstr, wlen,
5153 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005155 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5156 assert(res != -3);
5157 if (wlen) {
5158 *wlen = (size_t)res;
5159 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 return NULL;
5161 }
5162 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005163}
5164
Antoine Pitrouab868312009-01-10 15:40:25 +00005165
Victor Stinnere47e6982017-12-21 15:45:16 +01005166/* UTF-8 encoder using the surrogateescape error handler .
5167
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 On success, return 0 and write the newly allocated character string (use
5169 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005170
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005171 On encoding failure, return -2 and write the position of the invalid
5172 surrogate character into *error_pos (if error_pos is set) and the decoding
5173 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005174
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005175 On memory allocation failure, return -1. */
5176int
5177_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005178 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005179{
5180 const Py_ssize_t max_char_size = 4;
5181 Py_ssize_t len = wcslen(text);
5182
5183 assert(len >= 0);
5184
Victor Stinner3d4226a2018-08-29 22:21:32 +02005185 int surrogateescape = 0;
5186 int surrogatepass = 0;
5187 switch (errors)
5188 {
5189 case _Py_ERROR_STRICT:
5190 break;
5191 case _Py_ERROR_SURROGATEESCAPE:
5192 surrogateescape = 1;
5193 break;
5194 case _Py_ERROR_SURROGATEPASS:
5195 surrogatepass = 1;
5196 break;
5197 default:
5198 return -3;
5199 }
5200
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5202 return -1;
5203 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005204 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 if (raw_malloc) {
5206 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005207 }
5208 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005210 }
5211 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005212 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005213 }
5214
5215 char *p = bytes;
5216 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005217 for (i = 0; i < len; ) {
5218 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005220 i++;
5221#if Py_UNICODE_SIZE == 2
5222 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5223 && i < len
5224 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5225 {
5226 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5227 i++;
5228 }
5229#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005230
5231 if (ch < 0x80) {
5232 /* Encode ASCII */
5233 *p++ = (char) ch;
5234
5235 }
5236 else if (ch < 0x0800) {
5237 /* Encode Latin-1 */
5238 *p++ = (char)(0xc0 | (ch >> 6));
5239 *p++ = (char)(0x80 | (ch & 0x3f));
5240 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005241 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005242 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005244 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005245 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005246 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005247 if (reason != NULL) {
5248 *reason = "encoding error";
5249 }
5250 if (raw_malloc) {
5251 PyMem_RawFree(bytes);
5252 }
5253 else {
5254 PyMem_Free(bytes);
5255 }
5256 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005257 }
5258 *p++ = (char)(ch & 0xff);
5259 }
5260 else if (ch < 0x10000) {
5261 *p++ = (char)(0xe0 | (ch >> 12));
5262 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5263 *p++ = (char)(0x80 | (ch & 0x3f));
5264 }
5265 else { /* ch >= 0x10000 */
5266 assert(ch <= MAX_UNICODE);
5267 /* Encode UCS4 Unicode ordinals */
5268 *p++ = (char)(0xf0 | (ch >> 18));
5269 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5270 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5271 *p++ = (char)(0x80 | (ch & 0x3f));
5272 }
5273 }
5274 *p++ = '\0';
5275
5276 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005277 char *bytes2;
5278 if (raw_malloc) {
5279 bytes2 = PyMem_RawRealloc(bytes, final_size);
5280 }
5281 else {
5282 bytes2 = PyMem_Realloc(bytes, final_size);
5283 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005284 if (bytes2 == NULL) {
5285 if (error_pos != NULL) {
5286 *error_pos = (size_t)-1;
5287 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005288 if (raw_malloc) {
5289 PyMem_RawFree(bytes);
5290 }
5291 else {
5292 PyMem_Free(bytes);
5293 }
5294 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005295 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 *str = bytes2;
5297 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005298}
5299
5300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005301/* Primary internal function which creates utf8 encoded bytes objects.
5302
5303 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005304 and allocate exactly as much space needed at the end. Else allocate the
5305 maximum possible needed (4 result bytes per Unicode character), and return
5306 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005307*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005308static PyObject *
5309unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311{
Victor Stinner6099a032011-12-18 14:22:26 +01005312 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005313 void *data;
5314 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005316 if (!PyUnicode_Check(unicode)) {
5317 PyErr_BadArgument();
5318 return NULL;
5319 }
5320
5321 if (PyUnicode_READY(unicode) == -1)
5322 return NULL;
5323
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005324 if (PyUnicode_UTF8(unicode))
5325 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5326 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005327
5328 kind = PyUnicode_KIND(unicode);
5329 data = PyUnicode_DATA(unicode);
5330 size = PyUnicode_GET_LENGTH(unicode);
5331
Benjamin Petersonead6b532011-12-20 17:23:42 -06005332 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005333 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005334 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005335 case PyUnicode_1BYTE_KIND:
5336 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5337 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005338 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005339 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005340 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005341 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005342 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344}
5345
Alexander Belopolsky40018472011-02-26 01:02:56 +00005346PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005347_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5348{
5349 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5350}
5351
5352
5353PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005354PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5355 Py_ssize_t size,
5356 const char *errors)
5357{
5358 PyObject *v, *unicode;
5359
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005360 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361 if (unicode == NULL)
5362 return NULL;
5363 v = _PyUnicode_AsUTF8String(unicode, errors);
5364 Py_DECREF(unicode);
5365 return v;
5366}
5367
5368PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005369PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372}
5373
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374/* --- UTF-32 Codec ------------------------------------------------------- */
5375
5376PyObject *
5377PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 Py_ssize_t size,
5379 const char *errors,
5380 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381{
5382 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5383}
5384
5385PyObject *
5386PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 Py_ssize_t size,
5388 const char *errors,
5389 int *byteorder,
5390 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005391{
5392 const char *starts = s;
5393 Py_ssize_t startinpos;
5394 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005395 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005396 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005397 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005400 PyObject *errorHandler = NULL;
5401 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005402
Walter Dörwald41980ca2007-08-16 21:55:45 +00005403 q = (unsigned char *)s;
5404 e = q + size;
5405
5406 if (byteorder)
5407 bo = *byteorder;
5408
5409 /* Check for BOM marks (U+FEFF) in the input and adjust current
5410 byte order setting accordingly. In native mode, the leading BOM
5411 mark is skipped, in all other modes, it is copied to the output
5412 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005413 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005414 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005415 if (bom == 0x0000FEFF) {
5416 bo = -1;
5417 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005419 else if (bom == 0xFFFE0000) {
5420 bo = 1;
5421 q += 4;
5422 }
5423 if (byteorder)
5424 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005425 }
5426
Victor Stinnere64322e2012-10-30 23:12:47 +01005427 if (q == e) {
5428 if (consumed)
5429 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005430 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Victor Stinnere64322e2012-10-30 23:12:47 +01005433#ifdef WORDS_BIGENDIAN
5434 le = bo < 0;
5435#else
5436 le = bo <= 0;
5437#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005439
Victor Stinner8f674cc2013-04-17 23:02:17 +02005440 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005441 writer.min_length = (e - q + 3) / 4;
5442 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005444
Victor Stinnere64322e2012-10-30 23:12:47 +01005445 while (1) {
5446 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005447 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005448
Victor Stinnere64322e2012-10-30 23:12:47 +01005449 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005450 enum PyUnicode_Kind kind = writer.kind;
5451 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005452 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005453 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005454 if (le) {
5455 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005456 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005457 if (ch > maxch)
5458 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 if (kind != PyUnicode_1BYTE_KIND &&
5460 Py_UNICODE_IS_SURROGATE(ch))
5461 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005462 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005463 q += 4;
5464 } while (q <= last);
5465 }
5466 else {
5467 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005468 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005469 if (ch > maxch)
5470 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 if (kind != PyUnicode_1BYTE_KIND &&
5472 Py_UNICODE_IS_SURROGATE(ch))
5473 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005474 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005475 q += 4;
5476 } while (q <= last);
5477 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005478 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005479 }
5480
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005482 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 startinpos = ((const char *)q) - starts;
5484 endinpos = startinpos + 4;
5485 }
5486 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005489 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005491 startinpos = ((const char *)q) - starts;
5492 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005494 else {
5495 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005496 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 goto onError;
5498 q += 4;
5499 continue;
5500 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005501 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005502 startinpos = ((const char *)q) - starts;
5503 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005505
5506 /* The remaining input chars are ignored if the callback
5507 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005508 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005512 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005514 }
5515
Walter Dörwald41980ca2007-08-16 21:55:45 +00005516 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518
Walter Dörwald41980ca2007-08-16 21:55:45 +00005519 Py_XDECREF(errorHandler);
5520 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005521 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005524 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005525 Py_XDECREF(errorHandler);
5526 Py_XDECREF(exc);
5527 return NULL;
5528}
5529
5530PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005531_PyUnicode_EncodeUTF32(PyObject *str,
5532 const char *errors,
5533 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005534{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005535 enum PyUnicode_Kind kind;
5536 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005538 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005539 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005540#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005541 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005543 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005544#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005545 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005546 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005547 PyObject *errorHandler = NULL;
5548 PyObject *exc = NULL;
5549 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005551 if (!PyUnicode_Check(str)) {
5552 PyErr_BadArgument();
5553 return NULL;
5554 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005555 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005556 return NULL;
5557 kind = PyUnicode_KIND(str);
5558 data = PyUnicode_DATA(str);
5559 len = PyUnicode_GET_LENGTH(str);
5560
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005561 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005562 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005563 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005564 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005565 if (v == NULL)
5566 return NULL;
5567
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005568 /* output buffer is 4-bytes aligned */
5569 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005570 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005571 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005573 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005574 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005575
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005576 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005577 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005578 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005579 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 else
5581 encoding = "utf-32";
5582
5583 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5585 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 }
5587
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005588 pos = 0;
5589 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005590 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005591
5592 if (kind == PyUnicode_2BYTE_KIND) {
5593 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5594 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005596 else {
5597 assert(kind == PyUnicode_4BYTE_KIND);
5598 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5599 &out, native_ordering);
5600 }
5601 if (pos == len)
5602 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005603
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 rep = unicode_encode_call_errorhandler(
5605 errors, &errorHandler,
5606 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005607 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 if (!rep)
5609 goto error;
5610
5611 if (PyBytes_Check(rep)) {
5612 repsize = PyBytes_GET_SIZE(rep);
5613 if (repsize & 3) {
5614 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005615 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 "surrogates not allowed");
5617 goto error;
5618 }
5619 moreunits = repsize / 4;
5620 }
5621 else {
5622 assert(PyUnicode_Check(rep));
5623 if (PyUnicode_READY(rep) < 0)
5624 goto error;
5625 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5626 if (!PyUnicode_IS_ASCII(rep)) {
5627 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005628 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005629 "surrogates not allowed");
5630 goto error;
5631 }
5632 }
5633
5634 /* four bytes are reserved for each surrogate */
5635 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005636 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005637 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 /* integer overflow */
5639 PyErr_NoMemory();
5640 goto error;
5641 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005642 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005643 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005644 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005645 }
5646
5647 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005648 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005649 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005652 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5653 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005654 }
5655
5656 Py_CLEAR(rep);
5657 }
5658
5659 /* Cut back to size actually needed. This is necessary for, for example,
5660 encoding of a string containing isolated surrogates and the 'ignore'
5661 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005662 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005663 if (nsize != PyBytes_GET_SIZE(v))
5664 _PyBytes_Resize(&v, nsize);
5665 Py_XDECREF(errorHandler);
5666 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005667 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005668 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005669 error:
5670 Py_XDECREF(rep);
5671 Py_XDECREF(errorHandler);
5672 Py_XDECREF(exc);
5673 Py_XDECREF(v);
5674 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675}
5676
Alexander Belopolsky40018472011-02-26 01:02:56 +00005677PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5679 Py_ssize_t size,
5680 const char *errors,
5681 int byteorder)
5682{
5683 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005684 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 if (tmp == NULL)
5686 return NULL;
5687 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5688 Py_DECREF(tmp);
5689 return result;
5690}
5691
5692PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005693PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005694{
Victor Stinnerb960b342011-11-20 19:12:52 +01005695 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005696}
5697
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698/* --- UTF-16 Codec ------------------------------------------------------- */
5699
Tim Peters772747b2001-08-09 22:21:55 +00005700PyObject *
5701PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 Py_ssize_t size,
5703 const char *errors,
5704 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705{
Walter Dörwald69652032004-09-07 20:24:22 +00005706 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5707}
5708
5709PyObject *
5710PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 Py_ssize_t size,
5712 const char *errors,
5713 int *byteorder,
5714 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005715{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t startinpos;
5718 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005719 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005720 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005721 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005722 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005723 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 PyObject *errorHandler = NULL;
5725 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
Tim Peters772747b2001-08-09 22:21:55 +00005728 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005729 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
5731 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005732 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005734 /* Check for BOM marks (U+FEFF) in the input and adjust current
5735 byte order setting accordingly. In native mode, the leading BOM
5736 mark is skipped, in all other modes, it is copied to the output
5737 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005738 if (bo == 0 && size >= 2) {
5739 const Py_UCS4 bom = (q[1] << 8) | q[0];
5740 if (bom == 0xFEFF) {
5741 q += 2;
5742 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005744 else if (bom == 0xFFFE) {
5745 q += 2;
5746 bo = 1;
5747 }
5748 if (byteorder)
5749 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Antoine Pitrou63065d72012-05-15 23:48:04 +02005752 if (q == e) {
5753 if (consumed)
5754 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005755 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005756 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005757
Christian Heimes743e0cd2012-10-17 23:52:17 +02005758#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005759 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005761#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005762 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005764#endif
Tim Peters772747b2001-08-09 22:21:55 +00005765
Antoine Pitrou63065d72012-05-15 23:48:04 +02005766 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005767 character count normally. Error handler will take care of
5768 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005769 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005770 writer.min_length = (e - q + 1) / 2;
5771 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005773
Antoine Pitrou63065d72012-05-15 23:48:04 +02005774 while (1) {
5775 Py_UCS4 ch = 0;
5776 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005778 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005779 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005780 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005781 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005782 native_ordering);
5783 else
5784 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 native_ordering);
5787 } else if (kind == PyUnicode_2BYTE_KIND) {
5788 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 native_ordering);
5791 } else {
5792 assert(kind == PyUnicode_4BYTE_KIND);
5793 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005794 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005795 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005796 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005797 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 switch (ch)
5800 {
5801 case 0:
5802 /* remaining byte at the end? (size should be even) */
5803 if (q == e || consumed)
5804 goto End;
5805 errmsg = "truncated data";
5806 startinpos = ((const char *)q) - starts;
5807 endinpos = ((const char *)e) - starts;
5808 break;
5809 /* The remaining input chars are ignored if the callback
5810 chooses to skip the input */
5811 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005812 q -= 2;
5813 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005814 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005815 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005816 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005817 endinpos = ((const char *)e) - starts;
5818 break;
5819 case 2:
5820 errmsg = "illegal encoding";
5821 startinpos = ((const char *)q) - 2 - starts;
5822 endinpos = startinpos + 2;
5823 break;
5824 case 3:
5825 errmsg = "illegal UTF-16 surrogate";
5826 startinpos = ((const char *)q) - 4 - starts;
5827 endinpos = startinpos + 2;
5828 break;
5829 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005830 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 continue;
5833 }
5834
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005836 errors,
5837 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005839 &starts,
5840 (const char **)&e,
5841 &startinpos,
5842 &endinpos,
5843 &exc,
5844 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 }
5848
Antoine Pitrou63065d72012-05-15 23:48:04 +02005849End:
Walter Dörwald69652032004-09-07 20:24:22 +00005850 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 return NULL;
5862}
5863
Tim Peters772747b2001-08-09 22:21:55 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865_PyUnicode_EncodeUTF16(PyObject *str,
5866 const char *errors,
5867 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005869 enum PyUnicode_Kind kind;
5870 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005873 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005874 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005875#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005876 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005877#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005878 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005879#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005880 const char *encoding;
5881 Py_ssize_t nsize, pos;
5882 PyObject *errorHandler = NULL;
5883 PyObject *exc = NULL;
5884 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005885
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886 if (!PyUnicode_Check(str)) {
5887 PyErr_BadArgument();
5888 return NULL;
5889 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005890 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 return NULL;
5892 kind = PyUnicode_KIND(str);
5893 data = PyUnicode_DATA(str);
5894 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005895
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005897 if (kind == PyUnicode_4BYTE_KIND) {
5898 const Py_UCS4 *in = (const Py_UCS4 *)data;
5899 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005900 while (in < end) {
5901 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005903 }
5904 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005905 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005906 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005908 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005909 nsize = len + pairs + (byteorder == 0);
5910 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005911 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005915 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005916 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005917 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005918 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005919 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005920 }
5921 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005922 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005923 }
Tim Peters772747b2001-08-09 22:21:55 +00005924
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005925 if (kind == PyUnicode_1BYTE_KIND) {
5926 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5927 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005928 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005929
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005930 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005931 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005932 }
5933 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005934 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005935 }
5936 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005937 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005938 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939
5940 pos = 0;
5941 while (pos < len) {
5942 Py_ssize_t repsize, moreunits;
5943
5944 if (kind == PyUnicode_2BYTE_KIND) {
5945 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5946 &out, native_ordering);
5947 }
5948 else {
5949 assert(kind == PyUnicode_4BYTE_KIND);
5950 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5951 &out, native_ordering);
5952 }
5953 if (pos == len)
5954 break;
5955
5956 rep = unicode_encode_call_errorhandler(
5957 errors, &errorHandler,
5958 encoding, "surrogates not allowed",
5959 str, &exc, pos, pos + 1, &pos);
5960 if (!rep)
5961 goto error;
5962
5963 if (PyBytes_Check(rep)) {
5964 repsize = PyBytes_GET_SIZE(rep);
5965 if (repsize & 1) {
5966 raise_encode_exception(&exc, encoding,
5967 str, pos - 1, pos,
5968 "surrogates not allowed");
5969 goto error;
5970 }
5971 moreunits = repsize / 2;
5972 }
5973 else {
5974 assert(PyUnicode_Check(rep));
5975 if (PyUnicode_READY(rep) < 0)
5976 goto error;
5977 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5978 if (!PyUnicode_IS_ASCII(rep)) {
5979 raise_encode_exception(&exc, encoding,
5980 str, pos - 1, pos,
5981 "surrogates not allowed");
5982 goto error;
5983 }
5984 }
5985
5986 /* two bytes are reserved for each surrogate */
5987 if (moreunits > 1) {
5988 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005989 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005990 /* integer overflow */
5991 PyErr_NoMemory();
5992 goto error;
5993 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005994 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005995 goto error;
5996 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5997 }
5998
5999 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006000 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 out += moreunits;
6002 } else /* rep is unicode */ {
6003 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6004 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6005 &out, native_ordering);
6006 }
6007
6008 Py_CLEAR(rep);
6009 }
6010
6011 /* Cut back to size actually needed. This is necessary for, for example,
6012 encoding of a string containing isolated surrogates and the 'ignore' handler
6013 is used. */
6014 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6015 if (nsize != PyBytes_GET_SIZE(v))
6016 _PyBytes_Resize(&v, nsize);
6017 Py_XDECREF(errorHandler);
6018 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006019 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006020 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006021 error:
6022 Py_XDECREF(rep);
6023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
6025 Py_XDECREF(v);
6026 return NULL;
6027#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6032 Py_ssize_t size,
6033 const char *errors,
6034 int byteorder)
6035{
6036 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006037 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 if (tmp == NULL)
6039 return NULL;
6040 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6041 Py_DECREF(tmp);
6042 return result;
6043}
6044
6045PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006046PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006048 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049}
6050
6051/* --- Unicode Escape Codec ----------------------------------------------- */
6052
/* File-local pointer to a _PyUnicode_Name_CAPI table; starts out NULL.
   NOTE(review): presumably filled in on first use by the escape decoder
   that handles named characters -- the assignment is not visible in this
   part of the file, confirm there. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006053static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006054
Alexander Belopolsky40018472011-02-26 01:02:56 +00006055PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006056_PyUnicode_DecodeUnicodeEscape(const char *s,
6057 Py_ssize_t size,
6058 const char *errors,
6059 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006062 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 PyObject *errorHandler = NULL;
6065 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066
Eric V. Smith42454af2016-10-31 09:22:08 -04006067 // so we can remember if we've seen an invalid escape char or not
6068 *first_invalid_escape = NULL;
6069
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006071 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 }
6073 /* Escaped strings will always be longer than the resulting
6074 Unicode string, so we start with size here and then reduce the
6075 length after conversion to the true value.
6076 (but if the error callback returns a long replacement string
6077 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006078 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006079 writer.min_length = size;
6080 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6081 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006082 }
6083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 end = s + size;
6085 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 unsigned char c = (unsigned char) *s++;
6087 Py_UCS4 ch;
6088 int count;
6089 Py_ssize_t startinpos;
6090 Py_ssize_t endinpos;
6091 const char *message;
6092
6093#define WRITE_ASCII_CHAR(ch) \
6094 do { \
6095 assert(ch <= 127); \
6096 assert(writer.pos < writer.size); \
6097 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6098 } while(0)
6099
6100#define WRITE_CHAR(ch) \
6101 do { \
6102 if (ch <= writer.maxchar) { \
6103 assert(writer.pos < writer.size); \
6104 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6105 } \
6106 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6107 goto onError; \
6108 } \
6109 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 if (c != '\\') {
6113 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 continue;
6115 }
6116
Victor Stinner62ec3312016-09-06 17:04:34 -07006117 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006119 if (s >= end) {
6120 message = "\\ at end of string";
6121 goto error;
6122 }
6123 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006124
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006126 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006129 case '\n': continue;
6130 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6131 case '\'': WRITE_ASCII_CHAR('\''); continue;
6132 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6133 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006134 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6136 case 't': WRITE_ASCII_CHAR('\t'); continue;
6137 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6138 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006139 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006141 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 case '0': case '1': case '2': case '3':
6146 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006148 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 ch = (ch<<3) + *s++ - '0';
6150 if (s < end && '0' <= *s && *s <= '7') {
6151 ch = (ch<<3) + *s++ - '0';
6152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 WRITE_CHAR(ch);
6155 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 /* hex escapes */
6158 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006161 message = "truncated \\xXX escape";
6162 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006167 message = "truncated \\uXXXX escape";
6168 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006171 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006173 message = "truncated \\UXXXXXXXX escape";
6174 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006176 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 ch <<= 4;
6178 if (c >= '0' && c <= '9') {
6179 ch += c - '0';
6180 }
6181 else if (c >= 'a' && c <= 'f') {
6182 ch += c - ('a' - 10);
6183 }
6184 else if (c >= 'A' && c <= 'F') {
6185 ch += c - ('A' - 10);
6186 }
6187 else {
6188 break;
6189 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006190 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006191 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006192 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 }
6194
6195 /* when we get here, ch is a 32-bit unicode character */
6196 if (ch > MAX_UNICODE) {
6197 message = "illegal Unicode character";
6198 goto error;
6199 }
6200
6201 WRITE_CHAR(ch);
6202 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006205 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006206 if (ucnhash_CAPI == NULL) {
6207 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006208 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6209 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 if (ucnhash_CAPI == NULL) {
6211 PyErr_SetString(
6212 PyExc_UnicodeError,
6213 "\\N escapes not supported (can't load unicodedata module)"
6214 );
6215 goto onError;
6216 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006217 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006218
6219 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006220 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 const char *start = ++s;
6222 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006223 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006225 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 namelen = s - start;
6227 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006228 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006229 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 ch = 0xffffffff; /* in case 'getcode' messes up */
6231 if (namelen <= INT_MAX &&
6232 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6233 &ch, 0)) {
6234 assert(ch <= MAX_UNICODE);
6235 WRITE_CHAR(ch);
6236 continue;
6237 }
6238 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006239 }
6240 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006241 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006242
6243 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006244 if (*first_invalid_escape == NULL) {
6245 *first_invalid_escape = s-1; /* Back up one char, since we've
6246 already incremented s. */
6247 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 WRITE_ASCII_CHAR('\\');
6249 WRITE_CHAR(c);
6250 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006252
6253 error:
6254 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006256 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006257 errors, &errorHandler,
6258 "unicodeescape", message,
6259 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006261 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006262 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006263 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006264
6265#undef WRITE_ASCII_CHAR
6266#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006268
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006269 Py_XDECREF(errorHandler);
6270 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006271 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006272
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006274 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006275 Py_XDECREF(errorHandler);
6276 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 return NULL;
6278}
6279
Eric V. Smith42454af2016-10-31 09:22:08 -04006280PyObject *
6281PyUnicode_DecodeUnicodeEscape(const char *s,
6282 Py_ssize_t size,
6283 const char *errors)
6284{
6285 const char *first_invalid_escape;
6286 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6287 &first_invalid_escape);
6288 if (result == NULL)
6289 return NULL;
6290 if (first_invalid_escape != NULL) {
6291 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6292 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006293 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006294 Py_DECREF(result);
6295 return NULL;
6296 }
6297 }
6298 return result;
6299}
6300
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006301/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
Alexander Belopolsky40018472011-02-26 01:02:56 +00006303PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006304PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
Ezio Melottie7f90372012-10-05 03:33:31 +03006313 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006314 escape.
6315
Ezio Melottie7f90372012-10-05 03:33:31 +03006316 For UCS1 strings it's '\xxx', 4 bytes per source character.
6317 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6318 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006319 */
6320
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006321 if (!PyUnicode_Check(unicode)) {
6322 PyErr_BadArgument();
6323 return NULL;
6324 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006326 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 }
Victor Stinner358af132015-10-12 22:36:57 +02006328
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006329 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (len == 0) {
6331 return PyBytes_FromStringAndSize(NULL, 0);
6332 }
6333
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006334 kind = PyUnicode_KIND(unicode);
6335 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6337 bytes, and 1 byte characters 4. */
6338 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006339 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 return PyErr_NoMemory();
6341 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006342 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 if (repr == NULL) {
6344 return NULL;
6345 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006346
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006348 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006349 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006350
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 /* U+0000-U+00ff range */
6352 if (ch < 0x100) {
6353 if (ch >= ' ' && ch < 127) {
6354 if (ch != '\\') {
6355 /* Copy printable US ASCII as-is */
6356 *p++ = (char) ch;
6357 }
6358 /* Escape backslashes */
6359 else {
6360 *p++ = '\\';
6361 *p++ = '\\';
6362 }
6363 }
Victor Stinner358af132015-10-12 22:36:57 +02006364
Victor Stinner62ec3312016-09-06 17:04:34 -07006365 /* Map special whitespace to '\t', \n', '\r' */
6366 else if (ch == '\t') {
6367 *p++ = '\\';
6368 *p++ = 't';
6369 }
6370 else if (ch == '\n') {
6371 *p++ = '\\';
6372 *p++ = 'n';
6373 }
6374 else if (ch == '\r') {
6375 *p++ = '\\';
6376 *p++ = 'r';
6377 }
6378
6379 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6380 else {
6381 *p++ = '\\';
6382 *p++ = 'x';
6383 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6384 *p++ = Py_hexdigits[ch & 0x000F];
6385 }
Tim Petersced69f82003-09-16 20:30:58 +00006386 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006387 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 *p++ = '\\';
6390 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006391 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6392 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6393 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6394 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6397 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006398
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 /* Make sure that the first two digits are zero */
6400 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006401 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 *p++ = 'U';
6403 *p++ = '0';
6404 *p++ = '0';
6405 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6406 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6407 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6408 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6409 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6410 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 assert(p - PyBytes_AS_STRING(repr) > 0);
6415 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6416 return NULL;
6417 }
6418 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419}
6420
Alexander Belopolsky40018472011-02-26 01:02:56 +00006421PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6423 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006426 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 }
6430
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006431 result = PyUnicode_AsUnicodeEscapeString(tmp);
6432 Py_DECREF(tmp);
6433 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434}
6435
6436/* --- Raw Unicode Escape Codec ------------------------------------------- */
6437
Alexander Belopolsky40018472011-02-26 01:02:56 +00006438PyObject *
6439PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006440 Py_ssize_t size,
6441 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006444 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 PyObject *errorHandler = NULL;
6447 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006448
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006450 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 /* Escaped strings will always be longer than the resulting
6454 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 length after conversion to the true value. (But decoding error
6456 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006457 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 writer.min_length = size;
6459 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6460 goto onError;
6461 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006462
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 end = s + size;
6464 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 unsigned char c = (unsigned char) *s++;
6466 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006467 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 Py_ssize_t startinpos;
6469 Py_ssize_t endinpos;
6470 const char *message;
6471
6472#define WRITE_CHAR(ch) \
6473 do { \
6474 if (ch <= writer.maxchar) { \
6475 assert(writer.pos < writer.size); \
6476 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6477 } \
6478 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6479 goto onError; \
6480 } \
6481 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 if (c != '\\' || s >= end) {
6485 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006488
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 c = (unsigned char) *s++;
6490 if (c == 'u') {
6491 count = 4;
6492 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 else if (c == 'U') {
6495 count = 8;
6496 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006497 }
6498 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 assert(writer.pos < writer.size);
6500 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6501 WRITE_CHAR(c);
6502 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006503 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 startinpos = s - starts - 2;
6505
6506 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6507 for (ch = 0; count && s < end; ++s, --count) {
6508 c = (unsigned char)*s;
6509 ch <<= 4;
6510 if (c >= '0' && c <= '9') {
6511 ch += c - '0';
6512 }
6513 else if (c >= 'a' && c <= 'f') {
6514 ch += c - ('a' - 10);
6515 }
6516 else if (c >= 'A' && c <= 'F') {
6517 ch += c - ('A' - 10);
6518 }
6519 else {
6520 break;
6521 }
6522 }
6523 if (!count) {
6524 if (ch <= MAX_UNICODE) {
6525 WRITE_CHAR(ch);
6526 continue;
6527 }
6528 message = "\\Uxxxxxxxx out of range";
6529 }
6530
6531 endinpos = s-starts;
6532 writer.min_length = end - s + writer.pos;
6533 if (unicode_decode_call_errorhandler_writer(
6534 errors, &errorHandler,
6535 "rawunicodeescape", message,
6536 &starts, &end, &startinpos, &endinpos, &exc, &s,
6537 &writer)) {
6538 goto onError;
6539 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006540 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006541
6542#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 Py_XDECREF(errorHandler);
6545 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006546 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006547
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006549 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554}
6555
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006556
Alexander Belopolsky40018472011-02-26 01:02:56 +00006557PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006558PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Victor Stinner62ec3312016-09-06 17:04:34 -07006560 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006562 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006563 int kind;
6564 void *data;
6565 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006567 if (!PyUnicode_Check(unicode)) {
6568 PyErr_BadArgument();
6569 return NULL;
6570 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006572 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006574 kind = PyUnicode_KIND(unicode);
6575 data = PyUnicode_DATA(unicode);
6576 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 if (kind == PyUnicode_1BYTE_KIND) {
6578 return PyBytes_FromStringAndSize(data, len);
6579 }
Victor Stinner0e368262011-11-10 20:12:49 +01006580
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6582 bytes, and 1 byte characters 4. */
6583 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 if (len > PY_SSIZE_T_MAX / expandsize) {
6586 return PyErr_NoMemory();
6587 }
6588 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6589 if (repr == NULL) {
6590 return NULL;
6591 }
6592 if (len == 0) {
6593 return repr;
6594 }
6595
6596 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006597 for (pos = 0; pos < len; pos++) {
6598 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006599
Victor Stinner62ec3312016-09-06 17:04:34 -07006600 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6601 if (ch < 0x100) {
6602 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006603 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006604 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 *p++ = '\\';
6607 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006608 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6609 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6610 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6611 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6614 else {
6615 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6616 *p++ = '\\';
6617 *p++ = 'U';
6618 *p++ = '0';
6619 *p++ = '0';
6620 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6621 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6622 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6623 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6624 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6625 *p++ = Py_hexdigits[ch & 15];
6626 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006628
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 assert(p > PyBytes_AS_STRING(repr));
6630 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6631 return NULL;
6632 }
6633 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634}
6635
Alexander Belopolsky40018472011-02-26 01:02:56 +00006636PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006637PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006640 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006641 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006642 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006643 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6645 Py_DECREF(tmp);
6646 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
6649/* --- Latin-1 Codec ------------------------------------------------------ */
6650
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651PyObject *
6652PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006653 Py_ssize_t size,
6654 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006657 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658}
6659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661static void
6662make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006664 PyObject *unicode,
6665 Py_ssize_t startpos, Py_ssize_t endpos,
6666 const char *reason)
6667{
6668 if (*exceptionObject == NULL) {
6669 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006671 encoding, unicode, startpos, endpos, reason);
6672 }
6673 else {
6674 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6675 goto onError;
6676 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6677 goto onError;
6678 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6679 goto onError;
6680 return;
6681 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006682 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006683 }
6684}
6685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006687static void
6688raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006689 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006690 PyObject *unicode,
6691 Py_ssize_t startpos, Py_ssize_t endpos,
6692 const char *reason)
6693{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006694 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006695 encoding, unicode, startpos, endpos, reason);
6696 if (*exceptionObject != NULL)
6697 PyCodec_StrictErrors(*exceptionObject);
6698}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699
6700/* error handling callback helper:
6701 build arguments, call the callback and check the arguments,
6702 put the result into newpos and return the replacement string, which
6703 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006704static PyObject *
6705unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006706 PyObject **errorHandler,
6707 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006708 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006709 Py_ssize_t startpos, Py_ssize_t endpos,
6710 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006712 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 PyObject *restuple;
6715 PyObject *resunicode;
6716
6717 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 }
6722
Benjamin Petersonbac79492012-01-14 13:34:47 -05006723 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 return NULL;
6725 len = PyUnicode_GET_LENGTH(unicode);
6726
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006727 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006732 restuple = PyObject_CallFunctionObjArgs(
6733 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006737 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 Py_DECREF(restuple);
6739 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006741 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 &resunicode, newpos)) {
6743 Py_DECREF(restuple);
6744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006746 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6747 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6748 Py_DECREF(restuple);
6749 return NULL;
6750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 *newpos = len + *newpos;
6753 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006754 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 Py_DECREF(restuple);
6756 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 Py_INCREF(resunicode);
6759 Py_DECREF(restuple);
6760 return resunicode;
6761}
6762
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006765 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006766 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 /* input state */
6769 Py_ssize_t pos=0, size;
6770 int kind;
6771 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006772 /* pointer into the output */
6773 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006774 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6775 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006776 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006778 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006779 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006780 /* output object */
6781 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006782
Benjamin Petersonbac79492012-01-14 13:34:47 -05006783 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 return NULL;
6785 size = PyUnicode_GET_LENGTH(unicode);
6786 kind = PyUnicode_KIND(unicode);
6787 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006788 /* allocate enough for a simple encoding without
6789 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006790 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006791 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006792
6793 _PyBytesWriter_Init(&writer);
6794 str = _PyBytesWriter_Alloc(&writer, size);
6795 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006796 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006797
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006799 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006800
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006802 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006804 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006806 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006810 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006811 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006813
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006814 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006816
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006817 /* Only overallocate the buffer if it's not the last write */
6818 writer.overallocate = (collend < size);
6819
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006821 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006822 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006823
6824 switch (error_handler) {
6825 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006826 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006828
6829 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006830 memset(str, '?', collend - collstart);
6831 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006832 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006833 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 break;
Victor Stinner50149202015-09-22 00:26:54 +02006836
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006837 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006838 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006839 writer.min_size -= (collend - collstart);
6840 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006841 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006842 if (str == NULL)
6843 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006844 pos = collend;
6845 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006846
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006847 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006848 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006849 writer.min_size -= (collend - collstart);
6850 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006851 unicode, collstart, collend);
6852 if (str == NULL)
6853 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006854 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 break;
Victor Stinner50149202015-09-22 00:26:54 +02006856
Victor Stinnerc3713e92015-09-29 12:32:13 +02006857 case _Py_ERROR_SURROGATEESCAPE:
6858 for (i = collstart; i < collend; ++i) {
6859 ch = PyUnicode_READ(kind, data, i);
6860 if (ch < 0xdc80 || 0xdcff < ch) {
6861 /* Not a UTF-8b surrogate */
6862 break;
6863 }
6864 *str++ = (char)(ch - 0xdc00);
6865 ++pos;
6866 }
6867 if (i >= collend)
6868 break;
6869 collstart = pos;
6870 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006871 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006872
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006874 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6875 encoding, reason, unicode, &exc,
6876 collstart, collend, &newpos);
6877 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006879
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006880 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006881 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006882
Victor Stinner6bd525b2015-10-09 13:10:05 +02006883 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006884 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006885 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006886 PyBytes_AS_STRING(rep),
6887 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006888 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006889 else {
6890 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006891
Victor Stinner6bd525b2015-10-09 13:10:05 +02006892 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006894
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006895 if (limit == 256 ?
6896 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6897 !PyUnicode_IS_ASCII(rep))
6898 {
6899 /* Not all characters are smaller than limit */
6900 raise_encode_exception(&exc, encoding, unicode,
6901 collstart, collend, reason);
6902 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006904 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6905 str = _PyBytesWriter_WriteBytes(&writer, str,
6906 PyUnicode_DATA(rep),
6907 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006909 if (str == NULL)
6910 goto onError;
6911
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006912 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006913 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006914 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006915
6916 /* If overallocation was disabled, ensure that it was the last
6917 write. Otherwise, we missed an optimization */
6918 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006919 }
6920 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006921
Victor Stinner50149202015-09-22 00:26:54 +02006922 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006924 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006925
6926 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006927 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006928 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006929 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006930 Py_XDECREF(exc);
6931 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932}
6933
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006934/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006935PyObject *
6936PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006937 Py_ssize_t size,
6938 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006940 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006941 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006942 if (unicode == NULL)
6943 return NULL;
6944 result = unicode_encode_ucs1(unicode, errors, 256);
6945 Py_DECREF(unicode);
6946 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947}
6948
Alexander Belopolsky40018472011-02-26 01:02:56 +00006949PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006950_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951{
6952 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 PyErr_BadArgument();
6954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006956 if (PyUnicode_READY(unicode) == -1)
6957 return NULL;
6958 /* Fast path: if it is a one-byte string, construct
6959 bytes object directly. */
6960 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6961 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6962 PyUnicode_GET_LENGTH(unicode));
6963 /* Non-Latin-1 characters present. Defer to above function to
6964 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006965 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006966}
6967
6968PyObject*
6969PyUnicode_AsLatin1String(PyObject *unicode)
6970{
6971 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
6974/* --- 7-bit ASCII Codec -------------------------------------------------- */
6975
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976PyObject *
6977PyUnicode_DecodeASCII(const char *s,
6978 Py_ssize_t size,
6979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006981 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006982 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006983 int kind;
6984 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006985 Py_ssize_t startinpos;
6986 Py_ssize_t endinpos;
6987 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006989 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006990 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006992
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006994 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006997 if (size == 1 && (unsigned char)s[0] < 128)
6998 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006999
Victor Stinner8f674cc2013-04-17 23:02:17 +02007000 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007001 writer.min_length = size;
7002 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02007003 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007007 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 writer.pos = outpos;
7009 if (writer.pos == size)
7010 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007011
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 s += writer.pos;
7013 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007015 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007017 PyUnicode_WRITE(kind, data, writer.pos, c);
7018 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007020 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007022
7023 /* byte outsize range 0x00..0x7f: call the error handler */
7024
7025 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007026 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007027
7028 switch (error_handler)
7029 {
7030 case _Py_ERROR_REPLACE:
7031 case _Py_ERROR_SURROGATEESCAPE:
7032 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007033 but we may switch to UCS2 at the first write */
7034 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7035 goto onError;
7036 kind = writer.kind;
7037 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007038
7039 if (error_handler == _Py_ERROR_REPLACE)
7040 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7041 else
7042 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7043 writer.pos++;
7044 ++s;
7045 break;
7046
7047 case _Py_ERROR_IGNORE:
7048 ++s;
7049 break;
7050
7051 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 startinpos = s-starts;
7053 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007054 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007055 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 "ascii", "ordinal not in range(128)",
7057 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007058 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007060 kind = writer.kind;
7061 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007064 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007066 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007067
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007069 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007070 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 return NULL;
7073}
7074
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007075/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007076PyObject *
7077PyUnicode_EncodeASCII(const Py_UNICODE *p,
7078 Py_ssize_t size,
7079 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007082 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007083 if (unicode == NULL)
7084 return NULL;
7085 result = unicode_encode_ucs1(unicode, errors, 128);
7086 Py_DECREF(unicode);
7087 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088}
7089
Alexander Belopolsky40018472011-02-26 01:02:56 +00007090PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007091_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092{
7093 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 PyErr_BadArgument();
7095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007097 if (PyUnicode_READY(unicode) == -1)
7098 return NULL;
7099 /* Fast path: if it is an ASCII-only string, construct bytes object
7100 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007101 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007102 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7103 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007104 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007105}
7106
7107PyObject *
7108PyUnicode_AsASCIIString(PyObject *unicode)
7109{
7110 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Steve Dowercc16be82016-09-08 10:35:16 -07007113#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007114
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007115/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007116
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007117#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118#define NEED_RETRY
7119#endif
7120
Victor Stinner3a50e702011-10-18 21:21:00 +02007121#ifndef WC_ERR_INVALID_CHARS
7122# define WC_ERR_INVALID_CHARS 0x0080
7123#endif
7124
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007125static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007126code_page_name(UINT code_page, PyObject **obj)
7127{
7128 *obj = NULL;
7129 if (code_page == CP_ACP)
7130 return "mbcs";
7131 if (code_page == CP_UTF7)
7132 return "CP_UTF7";
7133 if (code_page == CP_UTF8)
7134 return "CP_UTF8";
7135
7136 *obj = PyBytes_FromFormat("cp%u", code_page);
7137 if (*obj == NULL)
7138 return NULL;
7139 return PyBytes_AS_STRING(*obj);
7140}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007141
Victor Stinner3a50e702011-10-18 21:21:00 +02007142static DWORD
7143decode_code_page_flags(UINT code_page)
7144{
7145 if (code_page == CP_UTF7) {
7146 /* The CP_UTF7 decoder only supports flags=0 */
7147 return 0;
7148 }
7149 else
7150 return MB_ERR_INVALID_CHARS;
7151}
7152
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007153/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007154 * Decode a byte string from a Windows code page into unicode object in strict
7155 * mode.
7156 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007157 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7158 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007159 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007160static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007161decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007162 wchar_t **buf,
7163 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007164 const char *in,
7165 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007167 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007168 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007169 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007170
7171 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007172 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007173 while ((outsize = MultiByteToWideChar(code_page, flags,
7174 in, insize, NULL, 0)) <= 0)
7175 {
7176 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7177 goto error;
7178 }
7179 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7180 flags = 0;
7181 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007182
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007183 /* Extend a wchar_t* buffer */
7184 Py_ssize_t n = *bufsize; /* Get the current length */
7185 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7186 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007187 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007188 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007189
7190 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007191 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7192 if (outsize <= 0)
7193 goto error;
7194 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007195
Victor Stinner3a50e702011-10-18 21:21:00 +02007196error:
7197 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7198 return -2;
7199 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007200 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007201}
7202
Victor Stinner3a50e702011-10-18 21:21:00 +02007203/*
7204 * Decode a byte string from a code page into unicode object with an error
7205 * handler.
7206 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007207 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007208 * UnicodeDecodeError exception and returns -1 on error.
7209 */
7210static int
7211decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007212 wchar_t **buf,
7213 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007214 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007215 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007216{
7217 const char *startin = in;
7218 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007219 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007220 /* Ideally, we should get reason from FormatMessage. This is the Windows
7221 2000 English version of the message. */
7222 const char *reason = "No mapping for the Unicode character exists "
7223 "in the target code page.";
7224 /* each step cannot decode more than 1 character, but a character can be
7225 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007226 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007227 int insize;
7228 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 PyObject *errorHandler = NULL;
7230 PyObject *exc = NULL;
7231 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007232 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007233 DWORD err;
7234 int ret = -1;
7235
7236 assert(size > 0);
7237
7238 encoding = code_page_name(code_page, &encoding_obj);
7239 if (encoding == NULL)
7240 return -1;
7241
Victor Stinner7d00cc12014-03-17 23:08:06 +01007242 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007243 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7244 UnicodeDecodeError. */
7245 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7246 if (exc != NULL) {
7247 PyCodec_StrictErrors(exc);
7248 Py_CLEAR(exc);
7249 }
7250 goto error;
7251 }
7252
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007253 /* Extend a wchar_t* buffer */
7254 Py_ssize_t n = *bufsize; /* Get the current length */
7255 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7256 PyErr_NoMemory();
7257 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007258 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007259 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7260 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007262 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007263
7264 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007265 while (in < endin)
7266 {
7267 /* Decode a character */
7268 insize = 1;
7269 do
7270 {
7271 outsize = MultiByteToWideChar(code_page, flags,
7272 in, insize,
7273 buffer, Py_ARRAY_LENGTH(buffer));
7274 if (outsize > 0)
7275 break;
7276 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007277 if (err == ERROR_INVALID_FLAGS && flags) {
7278 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7279 flags = 0;
7280 continue;
7281 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007282 if (err != ERROR_NO_UNICODE_TRANSLATION
7283 && err != ERROR_INSUFFICIENT_BUFFER)
7284 {
7285 PyErr_SetFromWindowsErr(0);
7286 goto error;
7287 }
7288 insize++;
7289 }
7290 /* 4=maximum length of a UTF-8 sequence */
7291 while (insize <= 4 && (in + insize) <= endin);
7292
7293 if (outsize <= 0) {
7294 Py_ssize_t startinpos, endinpos, outpos;
7295
Victor Stinner7d00cc12014-03-17 23:08:06 +01007296 /* last character in partial decode? */
7297 if (in + insize >= endin && !final)
7298 break;
7299
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 startinpos = in - startin;
7301 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007302 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007303 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 errors, &errorHandler,
7305 encoding, reason,
7306 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007307 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 {
7309 goto error;
7310 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007311 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007312 }
7313 else {
7314 in += insize;
7315 memcpy(out, buffer, outsize * sizeof(wchar_t));
7316 out += outsize;
7317 }
7318 }
7319
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007320 /* Shrink the buffer */
7321 assert(out - *buf <= *bufsize);
7322 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007323 /* (in - startin) <= size and size is an int */
7324 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007325
7326error:
7327 Py_XDECREF(encoding_obj);
7328 Py_XDECREF(errorHandler);
7329 Py_XDECREF(exc);
7330 return ret;
7331}
7332
Victor Stinner3a50e702011-10-18 21:21:00 +02007333static PyObject *
7334decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007335 const char *s, Py_ssize_t size,
7336 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007338 wchar_t *buf = NULL;
7339 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007340 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007341
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 if (code_page < 0) {
7343 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7344 return NULL;
7345 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007346 if (size < 0) {
7347 PyErr_BadInternalCall();
7348 return NULL;
7349 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007350
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007351 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007352 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007353
Victor Stinner76a31a62011-11-04 00:05:13 +01007354 do
7355 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007356#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007357 if (size > INT_MAX) {
7358 chunk_size = INT_MAX;
7359 final = 0;
7360 done = 0;
7361 }
7362 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007363#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007364 {
7365 chunk_size = (int)size;
7366 final = (consumed == NULL);
7367 done = 1;
7368 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369
Victor Stinner76a31a62011-11-04 00:05:13 +01007370 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007371 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007372 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007373 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007374 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007376 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007377 s, chunk_size);
7378 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007379 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007380 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007381 errors, final);
7382 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007383
7384 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007385 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007386 return NULL;
7387 }
7388
7389 if (consumed)
7390 *consumed += converted;
7391
7392 s += converted;
7393 size -= converted;
7394 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007395
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007396 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7397 PyMem_Free(buf);
7398 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007399}
7400
Alexander Belopolsky40018472011-02-26 01:02:56 +00007401PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007402PyUnicode_DecodeCodePageStateful(int code_page,
7403 const char *s,
7404 Py_ssize_t size,
7405 const char *errors,
7406 Py_ssize_t *consumed)
7407{
7408 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7409}
7410
7411PyObject *
7412PyUnicode_DecodeMBCSStateful(const char *s,
7413 Py_ssize_t size,
7414 const char *errors,
7415 Py_ssize_t *consumed)
7416{
7417 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7418}
7419
7420PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007421PyUnicode_DecodeMBCS(const char *s,
7422 Py_ssize_t size,
7423 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007424{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007425 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7426}
7427
Victor Stinner3a50e702011-10-18 21:21:00 +02007428static DWORD
7429encode_code_page_flags(UINT code_page, const char *errors)
7430{
7431 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007432 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007433 }
7434 else if (code_page == CP_UTF7) {
7435 /* CP_UTF7 only supports flags=0 */
7436 return 0;
7437 }
7438 else {
7439 if (errors != NULL && strcmp(errors, "replace") == 0)
7440 return 0;
7441 else
7442 return WC_NO_BEST_FIT_CHARS;
7443 }
7444}
7445
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007446/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007447 * Encode a Unicode string to a Windows code page into a byte string in strict
7448 * mode.
7449 *
7450 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007451 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007453static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007454encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007455 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007456 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007457{
Victor Stinner554f3f02010-06-16 23:33:54 +00007458 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 BOOL *pusedDefaultChar = &usedDefaultChar;
7460 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007461 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007462 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007463 const DWORD flags = encode_code_page_flags(code_page, NULL);
7464 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007465 /* Create a substring so that we can get the UTF-16 representation
7466 of just the slice under consideration. */
7467 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007468
Martin v. Löwis3d325192011-11-04 18:23:06 +01007469 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007470
Victor Stinner3a50e702011-10-18 21:21:00 +02007471 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007472 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007473 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007474 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007475
Victor Stinner2fc507f2011-11-04 20:06:39 +01007476 substring = PyUnicode_Substring(unicode, offset, offset+len);
7477 if (substring == NULL)
7478 return -1;
7479 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7480 if (p == NULL) {
7481 Py_DECREF(substring);
7482 return -1;
7483 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007484 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007485
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007486 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007487 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007488 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007489 NULL, 0,
7490 NULL, pusedDefaultChar);
7491 if (outsize <= 0)
7492 goto error;
7493 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007494 if (pusedDefaultChar && *pusedDefaultChar) {
7495 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007496 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007497 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007498
Victor Stinner3a50e702011-10-18 21:21:00 +02007499 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007501 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007502 if (*outbytes == NULL) {
7503 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007504 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007505 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007507 }
7508 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007509 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007510 const Py_ssize_t n = PyBytes_Size(*outbytes);
7511 if (outsize > PY_SSIZE_T_MAX - n) {
7512 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007513 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007514 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007515 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007516 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7517 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007518 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521 }
7522
7523 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007525 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 out, outsize,
7527 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007528 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007529 if (outsize <= 0)
7530 goto error;
7531 if (pusedDefaultChar && *pusedDefaultChar)
7532 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007533 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007534
Victor Stinner3a50e702011-10-18 21:21:00 +02007535error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007536 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007537 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7538 return -2;
7539 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007540 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007541}
7542
Victor Stinner3a50e702011-10-18 21:21:00 +02007543/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007544 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007545 * error handler.
7546 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007547 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007548 * -1 on other error.
7549 */
7550static int
7551encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007552 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007553 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007554{
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007556 Py_ssize_t pos = unicode_offset;
7557 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007558 /* Ideally, we should get reason from FormatMessage. This is the Windows
7559 2000 English version of the message. */
7560 const char *reason = "invalid character";
7561 /* 4=maximum length of a UTF-8 sequence */
7562 char buffer[4];
7563 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7564 Py_ssize_t outsize;
7565 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007566 PyObject *errorHandler = NULL;
7567 PyObject *exc = NULL;
7568 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007569 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007570 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007571 PyObject *rep;
7572 int ret = -1;
7573
7574 assert(insize > 0);
7575
7576 encoding = code_page_name(code_page, &encoding_obj);
7577 if (encoding == NULL)
7578 return -1;
7579
7580 if (errors == NULL || strcmp(errors, "strict") == 0) {
7581 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7582 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007583 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007584 if (exc != NULL) {
7585 PyCodec_StrictErrors(exc);
7586 Py_DECREF(exc);
7587 }
7588 Py_XDECREF(encoding_obj);
7589 return -1;
7590 }
7591
7592 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7593 pusedDefaultChar = &usedDefaultChar;
7594 else
7595 pusedDefaultChar = NULL;
7596
7597 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7598 PyErr_NoMemory();
7599 goto error;
7600 }
7601 outsize = insize * Py_ARRAY_LENGTH(buffer);
7602
7603 if (*outbytes == NULL) {
7604 /* Create string object */
7605 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7606 if (*outbytes == NULL)
7607 goto error;
7608 out = PyBytes_AS_STRING(*outbytes);
7609 }
7610 else {
7611 /* Extend string object */
7612 Py_ssize_t n = PyBytes_Size(*outbytes);
7613 if (n > PY_SSIZE_T_MAX - outsize) {
7614 PyErr_NoMemory();
7615 goto error;
7616 }
7617 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7618 goto error;
7619 out = PyBytes_AS_STRING(*outbytes) + n;
7620 }
7621
7622 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007623 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007624 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007625 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7626 wchar_t chars[2];
7627 int charsize;
7628 if (ch < 0x10000) {
7629 chars[0] = (wchar_t)ch;
7630 charsize = 1;
7631 }
7632 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007633 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7634 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007635 charsize = 2;
7636 }
7637
Victor Stinner3a50e702011-10-18 21:21:00 +02007638 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007639 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 buffer, Py_ARRAY_LENGTH(buffer),
7641 NULL, pusedDefaultChar);
7642 if (outsize > 0) {
7643 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7644 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007645 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 memcpy(out, buffer, outsize);
7647 out += outsize;
7648 continue;
7649 }
7650 }
7651 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7652 PyErr_SetFromWindowsErr(0);
7653 goto error;
7654 }
7655
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 rep = unicode_encode_call_errorhandler(
7657 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007658 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007659 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007660 if (rep == NULL)
7661 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007662 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007663
7664 if (PyBytes_Check(rep)) {
7665 outsize = PyBytes_GET_SIZE(rep);
7666 if (outsize != 1) {
7667 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7668 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7669 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7670 Py_DECREF(rep);
7671 goto error;
7672 }
7673 out = PyBytes_AS_STRING(*outbytes) + offset;
7674 }
7675 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7676 out += outsize;
7677 }
7678 else {
7679 Py_ssize_t i;
7680 enum PyUnicode_Kind kind;
7681 void *data;
7682
Benjamin Petersonbac79492012-01-14 13:34:47 -05007683 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007684 Py_DECREF(rep);
7685 goto error;
7686 }
7687
7688 outsize = PyUnicode_GET_LENGTH(rep);
7689 if (outsize != 1) {
7690 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7691 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7692 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7693 Py_DECREF(rep);
7694 goto error;
7695 }
7696 out = PyBytes_AS_STRING(*outbytes) + offset;
7697 }
7698 kind = PyUnicode_KIND(rep);
7699 data = PyUnicode_DATA(rep);
7700 for (i=0; i < outsize; i++) {
7701 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7702 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007703 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007704 encoding, unicode,
7705 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 "unable to encode error handler result to ASCII");
7707 Py_DECREF(rep);
7708 goto error;
7709 }
7710 *out = (unsigned char)ch;
7711 out++;
7712 }
7713 }
7714 Py_DECREF(rep);
7715 }
7716 /* write a NUL byte */
7717 *out = 0;
7718 outsize = out - PyBytes_AS_STRING(*outbytes);
7719 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7720 if (_PyBytes_Resize(outbytes, outsize) < 0)
7721 goto error;
7722 ret = 0;
7723
7724error:
7725 Py_XDECREF(encoding_obj);
7726 Py_XDECREF(errorHandler);
7727 Py_XDECREF(exc);
7728 return ret;
7729}
7730
Victor Stinner3a50e702011-10-18 21:21:00 +02007731static PyObject *
7732encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007733 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007734 const char *errors)
7735{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007736 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007737 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007738 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007739 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007740
Victor Stinner29dacf22015-01-26 16:41:32 +01007741 if (!PyUnicode_Check(unicode)) {
7742 PyErr_BadArgument();
7743 return NULL;
7744 }
7745
Benjamin Petersonbac79492012-01-14 13:34:47 -05007746 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007747 return NULL;
7748 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007749
Victor Stinner3a50e702011-10-18 21:21:00 +02007750 if (code_page < 0) {
7751 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7752 return NULL;
7753 }
7754
Martin v. Löwis3d325192011-11-04 18:23:06 +01007755 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007756 return PyBytes_FromStringAndSize(NULL, 0);
7757
Victor Stinner7581cef2011-11-03 22:32:33 +01007758 offset = 0;
7759 do
7760 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007761#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007762 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007763 chunks. */
7764 if (len > INT_MAX/2) {
7765 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007766 done = 0;
7767 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007768 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007769#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007770 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007771 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007772 done = 1;
7773 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007774
Victor Stinner76a31a62011-11-04 00:05:13 +01007775 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007776 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007777 errors);
7778 if (ret == -2)
7779 ret = encode_code_page_errors(code_page, &outbytes,
7780 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007781 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007782 if (ret < 0) {
7783 Py_XDECREF(outbytes);
7784 return NULL;
7785 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786
Victor Stinner7581cef2011-11-03 22:32:33 +01007787 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007788 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007789 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007790
Victor Stinner3a50e702011-10-18 21:21:00 +02007791 return outbytes;
7792}
7793
7794PyObject *
7795PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7796 Py_ssize_t size,
7797 const char *errors)
7798{
Victor Stinner7581cef2011-11-03 22:32:33 +01007799 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007800 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007801 if (unicode == NULL)
7802 return NULL;
7803 res = encode_code_page(CP_ACP, unicode, errors);
7804 Py_DECREF(unicode);
7805 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007806}
7807
7808PyObject *
7809PyUnicode_EncodeCodePage(int code_page,
7810 PyObject *unicode,
7811 const char *errors)
7812{
Victor Stinner7581cef2011-11-03 22:32:33 +01007813 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007814}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007815
Alexander Belopolsky40018472011-02-26 01:02:56 +00007816PyObject *
7817PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007818{
Victor Stinner7581cef2011-11-03 22:32:33 +01007819 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007820}
7821
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007822#undef NEED_RETRY
7823
Steve Dowercc16be82016-09-08 10:35:16 -07007824#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007825
Guido van Rossumd57fd912000-03-10 22:53:23 +00007826/* --- Character Mapping Codec -------------------------------------------- */
7827
Victor Stinnerfb161b12013-04-18 01:44:27 +02007828static int
7829charmap_decode_string(const char *s,
7830 Py_ssize_t size,
7831 PyObject *mapping,
7832 const char *errors,
7833 _PyUnicodeWriter *writer)
7834{
7835 const char *starts = s;
7836 const char *e;
7837 Py_ssize_t startinpos, endinpos;
7838 PyObject *errorHandler = NULL, *exc = NULL;
7839 Py_ssize_t maplen;
7840 enum PyUnicode_Kind mapkind;
7841 void *mapdata;
7842 Py_UCS4 x;
7843 unsigned char ch;
7844
7845 if (PyUnicode_READY(mapping) == -1)
7846 return -1;
7847
7848 maplen = PyUnicode_GET_LENGTH(mapping);
7849 mapdata = PyUnicode_DATA(mapping);
7850 mapkind = PyUnicode_KIND(mapping);
7851
7852 e = s + size;
7853
7854 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7855 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7856 * is disabled in encoding aliases, latin1 is preferred because
7857 * its implementation is faster. */
7858 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7859 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7860 Py_UCS4 maxchar = writer->maxchar;
7861
7862 assert (writer->kind == PyUnicode_1BYTE_KIND);
7863 while (s < e) {
7864 ch = *s;
7865 x = mapdata_ucs1[ch];
7866 if (x > maxchar) {
7867 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7868 goto onError;
7869 maxchar = writer->maxchar;
7870 outdata = (Py_UCS1 *)writer->data;
7871 }
7872 outdata[writer->pos] = x;
7873 writer->pos++;
7874 ++s;
7875 }
7876 return 0;
7877 }
7878
7879 while (s < e) {
7880 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7881 enum PyUnicode_Kind outkind = writer->kind;
7882 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7883 if (outkind == PyUnicode_1BYTE_KIND) {
7884 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7885 Py_UCS4 maxchar = writer->maxchar;
7886 while (s < e) {
7887 ch = *s;
7888 x = mapdata_ucs2[ch];
7889 if (x > maxchar)
7890 goto Error;
7891 outdata[writer->pos] = x;
7892 writer->pos++;
7893 ++s;
7894 }
7895 break;
7896 }
7897 else if (outkind == PyUnicode_2BYTE_KIND) {
7898 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7899 while (s < e) {
7900 ch = *s;
7901 x = mapdata_ucs2[ch];
7902 if (x == 0xFFFE)
7903 goto Error;
7904 outdata[writer->pos] = x;
7905 writer->pos++;
7906 ++s;
7907 }
7908 break;
7909 }
7910 }
7911 ch = *s;
7912
7913 if (ch < maplen)
7914 x = PyUnicode_READ(mapkind, mapdata, ch);
7915 else
7916 x = 0xfffe; /* invalid value */
7917Error:
7918 if (x == 0xfffe)
7919 {
7920 /* undefined mapping */
7921 startinpos = s-starts;
7922 endinpos = startinpos+1;
7923 if (unicode_decode_call_errorhandler_writer(
7924 errors, &errorHandler,
7925 "charmap", "character maps to <undefined>",
7926 &starts, &e, &startinpos, &endinpos, &exc, &s,
7927 writer)) {
7928 goto onError;
7929 }
7930 continue;
7931 }
7932
7933 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7934 goto onError;
7935 ++s;
7936 }
7937 Py_XDECREF(errorHandler);
7938 Py_XDECREF(exc);
7939 return 0;
7940
7941onError:
7942 Py_XDECREF(errorHandler);
7943 Py_XDECREF(exc);
7944 return -1;
7945}
7946
7947static int
7948charmap_decode_mapping(const char *s,
7949 Py_ssize_t size,
7950 PyObject *mapping,
7951 const char *errors,
7952 _PyUnicodeWriter *writer)
7953{
7954 const char *starts = s;
7955 const char *e;
7956 Py_ssize_t startinpos, endinpos;
7957 PyObject *errorHandler = NULL, *exc = NULL;
7958 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007959 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007960
7961 e = s + size;
7962
7963 while (s < e) {
7964 ch = *s;
7965
7966 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7967 key = PyLong_FromLong((long)ch);
7968 if (key == NULL)
7969 goto onError;
7970
7971 item = PyObject_GetItem(mapping, key);
7972 Py_DECREF(key);
7973 if (item == NULL) {
7974 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7975 /* No mapping found means: mapping is undefined. */
7976 PyErr_Clear();
7977 goto Undefined;
7978 } else
7979 goto onError;
7980 }
7981
7982 /* Apply mapping */
7983 if (item == Py_None)
7984 goto Undefined;
7985 if (PyLong_Check(item)) {
7986 long value = PyLong_AS_LONG(item);
7987 if (value == 0xFFFE)
7988 goto Undefined;
7989 if (value < 0 || value > MAX_UNICODE) {
7990 PyErr_Format(PyExc_TypeError,
7991 "character mapping must be in range(0x%lx)",
7992 (unsigned long)MAX_UNICODE + 1);
7993 goto onError;
7994 }
7995
7996 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7997 goto onError;
7998 }
7999 else if (PyUnicode_Check(item)) {
8000 if (PyUnicode_READY(item) == -1)
8001 goto onError;
8002 if (PyUnicode_GET_LENGTH(item) == 1) {
8003 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8004 if (value == 0xFFFE)
8005 goto Undefined;
8006 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8007 goto onError;
8008 }
8009 else {
8010 writer->overallocate = 1;
8011 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8012 goto onError;
8013 }
8014 }
8015 else {
8016 /* wrong return value */
8017 PyErr_SetString(PyExc_TypeError,
8018 "character mapping must return integer, None or str");
8019 goto onError;
8020 }
8021 Py_CLEAR(item);
8022 ++s;
8023 continue;
8024
8025Undefined:
8026 /* undefined mapping */
8027 Py_CLEAR(item);
8028 startinpos = s-starts;
8029 endinpos = startinpos+1;
8030 if (unicode_decode_call_errorhandler_writer(
8031 errors, &errorHandler,
8032 "charmap", "character maps to <undefined>",
8033 &starts, &e, &startinpos, &endinpos, &exc, &s,
8034 writer)) {
8035 goto onError;
8036 }
8037 }
8038 Py_XDECREF(errorHandler);
8039 Py_XDECREF(exc);
8040 return 0;
8041
8042onError:
8043 Py_XDECREF(item);
8044 Py_XDECREF(errorHandler);
8045 Py_XDECREF(exc);
8046 return -1;
8047}
8048
Alexander Belopolsky40018472011-02-26 01:02:56 +00008049PyObject *
8050PyUnicode_DecodeCharmap(const char *s,
8051 Py_ssize_t size,
8052 PyObject *mapping,
8053 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008054{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008055 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 /* Default to Latin-1 */
8058 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008059 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008060
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008062 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008063 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008064 writer.min_length = size;
8065 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008066 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008067
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008068 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008069 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8070 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008071 }
8072 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008073 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8074 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008075 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008076 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008077
Benjamin Peterson29060642009-01-31 22:14:21 +00008078 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008079 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008080 return NULL;
8081}
8082
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008083/* Charmap encoding: the lookup table */
8084
Alexander Belopolsky40018472011-02-26 01:02:56 +00008085struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008086 PyObject_HEAD
8087 unsigned char level1[32];
8088 int count2, count3;
8089 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008090};
8091
8092static PyObject*
8093encoding_map_size(PyObject *obj, PyObject* args)
8094{
8095 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008096 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008097 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008098}
8099
8100static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008101 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008102 PyDoc_STR("Return the size (in bytes) of this object") },
8103 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008104};
8105
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008107 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008108 "EncodingMap", /*tp_name*/
8109 sizeof(struct encoding_map), /*tp_basicsize*/
8110 0, /*tp_itemsize*/
8111 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008112 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008113 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008114 0, /*tp_getattr*/
8115 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008116 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008117 0, /*tp_repr*/
8118 0, /*tp_as_number*/
8119 0, /*tp_as_sequence*/
8120 0, /*tp_as_mapping*/
8121 0, /*tp_hash*/
8122 0, /*tp_call*/
8123 0, /*tp_str*/
8124 0, /*tp_getattro*/
8125 0, /*tp_setattro*/
8126 0, /*tp_as_buffer*/
8127 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8128 0, /*tp_doc*/
8129 0, /*tp_traverse*/
8130 0, /*tp_clear*/
8131 0, /*tp_richcompare*/
8132 0, /*tp_weaklistoffset*/
8133 0, /*tp_iter*/
8134 0, /*tp_iternext*/
8135 encoding_map_methods, /*tp_methods*/
8136 0, /*tp_members*/
8137 0, /*tp_getset*/
8138 0, /*tp_base*/
8139 0, /*tp_dict*/
8140 0, /*tp_descr_get*/
8141 0, /*tp_descr_set*/
8142 0, /*tp_dictoffset*/
8143 0, /*tp_init*/
8144 0, /*tp_alloc*/
8145 0, /*tp_new*/
8146 0, /*tp_free*/
8147 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008148};
8149
8150PyObject*
8151PyUnicode_BuildEncodingMap(PyObject* string)
8152{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008153 PyObject *result;
8154 struct encoding_map *mresult;
8155 int i;
8156 int need_dict = 0;
8157 unsigned char level1[32];
8158 unsigned char level2[512];
8159 unsigned char *mlevel1, *mlevel2, *mlevel3;
8160 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008161 int kind;
8162 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008163 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008164 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008165
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008166 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167 PyErr_BadArgument();
8168 return NULL;
8169 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008170 kind = PyUnicode_KIND(string);
8171 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008172 length = PyUnicode_GET_LENGTH(string);
8173 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008174 memset(level1, 0xFF, sizeof level1);
8175 memset(level2, 0xFF, sizeof level2);
8176
8177 /* If there isn't a one-to-one mapping of NULL to \0,
8178 or if there are non-BMP characters, we need to use
8179 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008180 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008181 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008182 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184 ch = PyUnicode_READ(kind, data, i);
8185 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008186 need_dict = 1;
8187 break;
8188 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008189 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008190 /* unmapped character */
8191 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008192 l1 = ch >> 11;
8193 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008194 if (level1[l1] == 0xFF)
8195 level1[l1] = count2++;
8196 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008197 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008198 }
8199
8200 if (count2 >= 0xFF || count3 >= 0xFF)
8201 need_dict = 1;
8202
8203 if (need_dict) {
8204 PyObject *result = PyDict_New();
8205 PyObject *key, *value;
8206 if (!result)
8207 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008208 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008209 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008210 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008211 if (!key || !value)
8212 goto failed1;
8213 if (PyDict_SetItem(result, key, value) == -1)
8214 goto failed1;
8215 Py_DECREF(key);
8216 Py_DECREF(value);
8217 }
8218 return result;
8219 failed1:
8220 Py_XDECREF(key);
8221 Py_XDECREF(value);
8222 Py_DECREF(result);
8223 return NULL;
8224 }
8225
8226 /* Create a three-level trie */
8227 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8228 16*count2 + 128*count3 - 1);
8229 if (!result)
8230 return PyErr_NoMemory();
8231 PyObject_Init(result, &EncodingMapType);
8232 mresult = (struct encoding_map*)result;
8233 mresult->count2 = count2;
8234 mresult->count3 = count3;
8235 mlevel1 = mresult->level1;
8236 mlevel2 = mresult->level23;
8237 mlevel3 = mresult->level23 + 16*count2;
8238 memcpy(mlevel1, level1, 32);
8239 memset(mlevel2, 0xFF, 16*count2);
8240 memset(mlevel3, 0, 128*count3);
8241 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008242 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008243 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008244 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8245 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008246 /* unmapped character */
8247 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008248 o1 = ch>>11;
8249 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 i2 = 16*mlevel1[o1] + o2;
8251 if (mlevel2[i2] == 0xFF)
8252 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008253 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008254 i3 = 128*mlevel2[i2] + o3;
8255 mlevel3[i3] = i;
8256 }
8257 return result;
8258}
8259
8260static int
Victor Stinner22168992011-11-20 17:09:18 +01008261encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008262{
8263 struct encoding_map *map = (struct encoding_map*)mapping;
8264 int l1 = c>>11;
8265 int l2 = (c>>7) & 0xF;
8266 int l3 = c & 0x7F;
8267 int i;
8268
Victor Stinner22168992011-11-20 17:09:18 +01008269 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008270 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008271 if (c == 0)
8272 return 0;
8273 /* level 1*/
8274 i = map->level1[l1];
8275 if (i == 0xFF) {
8276 return -1;
8277 }
8278 /* level 2*/
8279 i = map->level23[16*i+l2];
8280 if (i == 0xFF) {
8281 return -1;
8282 }
8283 /* level 3 */
8284 i = map->level23[16*map->count2 + 128*i + l3];
8285 if (i == 0) {
8286 return -1;
8287 }
8288 return i;
8289}
8290
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008291/* Lookup the character ch in the mapping. If the character
8292 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008293 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008294static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008295charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008296{
Christian Heimes217cfd12007-12-02 14:31:20 +00008297 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008298 PyObject *x;
8299
8300 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008301 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 x = PyObject_GetItem(mapping, w);
8303 Py_DECREF(w);
8304 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8306 /* No mapping found means: mapping is undefined. */
8307 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008308 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 } else
8310 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008311 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008312 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008314 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 long value = PyLong_AS_LONG(x);
8316 if (value < 0 || value > 255) {
8317 PyErr_SetString(PyExc_TypeError,
8318 "character mapping must be in range(256)");
8319 Py_DECREF(x);
8320 return NULL;
8321 }
8322 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008323 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008324 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008325 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008326 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 /* wrong return value */
8328 PyErr_Format(PyExc_TypeError,
8329 "character mapping must return integer, bytes or None, not %.400s",
8330 x->ob_type->tp_name);
8331 Py_DECREF(x);
8332 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008333 }
8334}
8335
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008336static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008337charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008339 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8340 /* exponentially overallocate to minimize reallocations */
8341 if (requiredsize < 2*outsize)
8342 requiredsize = 2*outsize;
8343 if (_PyBytes_Resize(outobj, requiredsize))
8344 return -1;
8345 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008346}
8347
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008351/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008352 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353 space is available. Return a new reference to the object that
8354 was put in the output buffer, or Py_None, if the mapping was undefined
8355 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008356 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008357static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008358charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008360{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008361 PyObject *rep;
8362 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008363 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364
Christian Heimes90aa7642007-12-19 02:45:37 +00008365 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008366 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008367 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 if (res == -1)
8369 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008370 if (outsize<requiredsize)
8371 if (charmapencode_resize(outobj, outpos, requiredsize))
8372 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008373 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 outstart[(*outpos)++] = (char)res;
8375 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008376 }
8377
8378 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008379 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008380 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008381 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 Py_DECREF(rep);
8383 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008384 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008385 if (PyLong_Check(rep)) {
8386 Py_ssize_t requiredsize = *outpos+1;
8387 if (outsize<requiredsize)
8388 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8389 Py_DECREF(rep);
8390 return enc_EXCEPTION;
8391 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008392 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008393 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008394 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 else {
8396 const char *repchars = PyBytes_AS_STRING(rep);
8397 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8398 Py_ssize_t requiredsize = *outpos+repsize;
8399 if (outsize<requiredsize)
8400 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8401 Py_DECREF(rep);
8402 return enc_EXCEPTION;
8403 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008404 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008405 memcpy(outstart + *outpos, repchars, repsize);
8406 *outpos += repsize;
8407 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008408 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008409 Py_DECREF(rep);
8410 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411}
8412
8413/* handle an error in PyUnicode_EncodeCharmap
8414 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008415static int
8416charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008417 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008419 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008420 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008421{
8422 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008423 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008424 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008425 enum PyUnicode_Kind kind;
8426 void *data;
8427 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008428 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008429 Py_ssize_t collstartpos = *inpos;
8430 Py_ssize_t collendpos = *inpos+1;
8431 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008432 const char *encoding = "charmap";
8433 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008434 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008435 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008436 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008437
Benjamin Petersonbac79492012-01-14 13:34:47 -05008438 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008439 return -1;
8440 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441 /* find all unencodable characters */
8442 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008443 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008444 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008445 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008446 val = encoding_map_lookup(ch, mapping);
8447 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008448 break;
8449 ++collendpos;
8450 continue;
8451 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008452
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008453 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8454 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008455 if (rep==NULL)
8456 return -1;
8457 else if (rep!=Py_None) {
8458 Py_DECREF(rep);
8459 break;
8460 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008461 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008462 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008463 }
8464 /* cache callback name lookup
8465 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008466 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008467 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008468
8469 switch (*error_handler) {
8470 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008471 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008472 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008473
8474 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008475 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008476 x = charmapencode_output('?', mapping, res, respos);
8477 if (x==enc_EXCEPTION) {
8478 return -1;
8479 }
8480 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008481 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008482 return -1;
8483 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008484 }
8485 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008486 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008487 *inpos = collendpos;
8488 break;
Victor Stinner50149202015-09-22 00:26:54 +02008489
8490 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008491 /* generate replacement (temporarily (mis)uses p) */
8492 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008493 char buffer[2+29+1+1];
8494 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008495 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008496 for (cp = buffer; *cp; ++cp) {
8497 x = charmapencode_output(*cp, mapping, res, respos);
8498 if (x==enc_EXCEPTION)
8499 return -1;
8500 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008501 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 return -1;
8503 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008504 }
8505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 *inpos = collendpos;
8507 break;
Victor Stinner50149202015-09-22 00:26:54 +02008508
Benjamin Peterson14339b62009-01-31 16:36:08 +00008509 default:
Victor Stinner50149202015-09-22 00:26:54 +02008510 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008511 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008512 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008515 if (PyBytes_Check(repunicode)) {
8516 /* Directly copy bytes result to output. */
8517 Py_ssize_t outsize = PyBytes_Size(*res);
8518 Py_ssize_t requiredsize;
8519 repsize = PyBytes_Size(repunicode);
8520 requiredsize = *respos + repsize;
8521 if (requiredsize > outsize)
8522 /* Make room for all additional bytes. */
8523 if (charmapencode_resize(res, respos, requiredsize)) {
8524 Py_DECREF(repunicode);
8525 return -1;
8526 }
8527 memcpy(PyBytes_AsString(*res) + *respos,
8528 PyBytes_AsString(repunicode), repsize);
8529 *respos += repsize;
8530 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008531 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008532 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008533 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008534 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008535 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008536 Py_DECREF(repunicode);
8537 return -1;
8538 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008539 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008540 data = PyUnicode_DATA(repunicode);
8541 kind = PyUnicode_KIND(repunicode);
8542 for (index = 0; index < repsize; index++) {
8543 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8544 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008545 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008546 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 return -1;
8548 }
8549 else if (x==enc_FAILED) {
8550 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008551 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008552 return -1;
8553 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008554 }
8555 *inpos = newpos;
8556 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557 }
8558 return 0;
8559}
8560
Alexander Belopolsky40018472011-02-26 01:02:56 +00008561PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008562_PyUnicode_EncodeCharmap(PyObject *unicode,
8563 PyObject *mapping,
8564 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008565{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008566 /* output object */
8567 PyObject *res = NULL;
8568 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008569 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008570 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008571 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008572 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008573 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008574 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008575 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008576 void *data;
8577 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008578
Benjamin Petersonbac79492012-01-14 13:34:47 -05008579 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008580 return NULL;
8581 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008582 data = PyUnicode_DATA(unicode);
8583 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584
Guido van Rossumd57fd912000-03-10 22:53:23 +00008585 /* Default to Latin-1 */
8586 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008587 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008588
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008589 /* allocate enough for a simple encoding without
8590 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008591 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008592 if (res == NULL)
8593 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008594 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008595 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008596
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008597 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008598 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008600 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 if (x==enc_EXCEPTION) /* error */
8602 goto onError;
8603 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008604 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008606 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 &res, &respos)) {
8608 goto onError;
8609 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008610 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 else
8612 /* done with this character => adjust input position */
8613 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008614 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008615
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008616 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008617 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008618 if (_PyBytes_Resize(&res, respos) < 0)
8619 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008620
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008621 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008622 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 return res;
8624
Benjamin Peterson29060642009-01-31 22:14:21 +00008625 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008626 Py_XDECREF(res);
8627 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008628 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008629 return NULL;
8630}
8631
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008632/* Deprecated */
8633PyObject *
8634PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8635 Py_ssize_t size,
8636 PyObject *mapping,
8637 const char *errors)
8638{
8639 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008640 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008641 if (unicode == NULL)
8642 return NULL;
8643 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8644 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008645 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008646}
8647
Alexander Belopolsky40018472011-02-26 01:02:56 +00008648PyObject *
8649PyUnicode_AsCharmapString(PyObject *unicode,
8650 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008651{
8652 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008653 PyErr_BadArgument();
8654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008656 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657}
8658
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008659/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008660static void
8661make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008662 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008663 Py_ssize_t startpos, Py_ssize_t endpos,
8664 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008665{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008666 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008667 *exceptionObject = _PyUnicodeTranslateError_Create(
8668 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669 }
8670 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008671 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8672 goto onError;
8673 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8674 goto onError;
8675 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8676 goto onError;
8677 return;
8678 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008679 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008680 }
8681}
8682
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008683/* error handling callback helper:
8684 build arguments, call the callback and check the arguments,
8685 put the result into newpos and return the replacement string, which
8686 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008687static PyObject *
8688unicode_translate_call_errorhandler(const char *errors,
8689 PyObject **errorHandler,
8690 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008691 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008692 Py_ssize_t startpos, Py_ssize_t endpos,
8693 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008695 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008697 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698 PyObject *restuple;
8699 PyObject *resunicode;
8700
8701 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008702 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008703 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 }
8706
8707 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008708 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008710 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008712 restuple = PyObject_CallFunctionObjArgs(
8713 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008714 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008717 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008718 Py_DECREF(restuple);
8719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008721 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 &resunicode, &i_newpos)) {
8723 Py_DECREF(restuple);
8724 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008725 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008726 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008727 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008728 else
8729 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008730 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008731 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008732 Py_DECREF(restuple);
8733 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008734 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008735 Py_INCREF(resunicode);
8736 Py_DECREF(restuple);
8737 return resunicode;
8738}
8739
8740/* Lookup the character ch in the mapping and put the result in result,
8741 which must be decrefed by the caller.
8742 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008743static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008744charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008745{
Christian Heimes217cfd12007-12-02 14:31:20 +00008746 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747 PyObject *x;
8748
8749 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008750 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751 x = PyObject_GetItem(mapping, w);
8752 Py_DECREF(w);
8753 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8755 /* No mapping found means: use 1:1 mapping. */
8756 PyErr_Clear();
8757 *result = NULL;
8758 return 0;
8759 } else
8760 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008761 }
8762 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008763 *result = x;
8764 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008766 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008768 if (value < 0 || value > MAX_UNICODE) {
8769 PyErr_Format(PyExc_ValueError,
8770 "character mapping must be in range(0x%x)",
8771 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008772 Py_DECREF(x);
8773 return -1;
8774 }
8775 *result = x;
8776 return 0;
8777 }
8778 else if (PyUnicode_Check(x)) {
8779 *result = x;
8780 return 0;
8781 }
8782 else {
8783 /* wrong return value */
8784 PyErr_SetString(PyExc_TypeError,
8785 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008786 Py_DECREF(x);
8787 return -1;
8788 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008789}
Victor Stinner1194ea02014-04-04 19:37:40 +02008790
8791/* lookup the character, write the result into the writer.
8792 Return 1 if the result was written into the writer, return 0 if the mapping
8793 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008794static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008795charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8796 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008797{
Victor Stinner1194ea02014-04-04 19:37:40 +02008798 PyObject *item;
8799
8800 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008801 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008802
8803 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008804 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008805 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008806 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008807 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008808 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008809 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008810
8811 if (item == Py_None) {
8812 Py_DECREF(item);
8813 return 0;
8814 }
8815
8816 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008817 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8818 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8819 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008820 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8821 Py_DECREF(item);
8822 return -1;
8823 }
8824 Py_DECREF(item);
8825 return 1;
8826 }
8827
8828 if (!PyUnicode_Check(item)) {
8829 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008831 }
8832
8833 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8834 Py_DECREF(item);
8835 return -1;
8836 }
8837
8838 Py_DECREF(item);
8839 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008840}
8841
Victor Stinner89a76ab2014-04-05 11:44:04 +02008842static int
8843unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8844 Py_UCS1 *translate)
8845{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008846 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008847 int ret = 0;
8848
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 if (charmaptranslate_lookup(ch, mapping, &item)) {
8850 return -1;
8851 }
8852
8853 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008854 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008855 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008856 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008857 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 /* not found => default to 1:1 mapping */
8859 translate[ch] = ch;
8860 return 1;
8861 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008862 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008863 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008864 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8865 used it */
8866 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008867 /* invalid character or character outside ASCII:
8868 skip the fast translate */
8869 goto exit;
8870 }
8871 translate[ch] = (Py_UCS1)replace;
8872 }
8873 else if (PyUnicode_Check(item)) {
8874 Py_UCS4 replace;
8875
8876 if (PyUnicode_READY(item) == -1) {
8877 Py_DECREF(item);
8878 return -1;
8879 }
8880 if (PyUnicode_GET_LENGTH(item) != 1)
8881 goto exit;
8882
8883 replace = PyUnicode_READ_CHAR(item, 0);
8884 if (replace > 127)
8885 goto exit;
8886 translate[ch] = (Py_UCS1)replace;
8887 }
8888 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008889 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008890 goto exit;
8891 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 ret = 1;
8893
Benjamin Peterson1365de72014-04-07 20:15:41 -04008894 exit:
8895 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 return ret;
8897}
8898
8899/* Fast path for ascii => ascii translation. Return 1 if the whole string
8900 was translated into writer, return 0 if the input string was partially
8901 translated into writer, raise an exception and return -1 on error. */
8902static int
8903unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008904 _PyUnicodeWriter *writer, int ignore,
8905 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008906{
Victor Stinner872b2912014-04-05 14:27:07 +02008907 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908 Py_ssize_t len;
8909 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008910 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008911
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 len = PyUnicode_GET_LENGTH(input);
8913
Victor Stinner872b2912014-04-05 14:27:07 +02008914 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915
8916 in = PyUnicode_1BYTE_DATA(input);
8917 end = in + len;
8918
8919 assert(PyUnicode_IS_ASCII(writer->buffer));
8920 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8921 out = PyUnicode_1BYTE_DATA(writer->buffer);
8922
Victor Stinner872b2912014-04-05 14:27:07 +02008923 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008924 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008925 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008927 int translate = unicode_fast_translate_lookup(mapping, ch,
8928 ascii_table);
8929 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008931 if (translate == 0)
8932 goto exit;
8933 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 }
Victor Stinner872b2912014-04-05 14:27:07 +02008935 if (ch2 == 0xfe) {
8936 if (ignore)
8937 continue;
8938 goto exit;
8939 }
8940 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008941 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008942 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008943 }
Victor Stinner872b2912014-04-05 14:27:07 +02008944 res = 1;
8945
8946exit:
8947 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008948 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008949 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008950}
8951
Victor Stinner3222da22015-10-01 22:07:32 +02008952static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008953_PyUnicode_TranslateCharmap(PyObject *input,
8954 PyObject *mapping,
8955 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008956{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008958 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 Py_ssize_t size, i;
8960 int kind;
8961 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008962 _PyUnicodeWriter writer;
8963 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008964 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008965 PyObject *errorHandler = NULL;
8966 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008967 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008969
Guido van Rossumd57fd912000-03-10 22:53:23 +00008970 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008971 PyErr_BadArgument();
8972 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008973 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008974
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008975 if (PyUnicode_READY(input) == -1)
8976 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008977 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008978 kind = PyUnicode_KIND(input);
8979 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008981 if (size == 0)
8982 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008983
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008984 /* allocate enough for a simple 1:1 translation without
8985 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008986 _PyUnicodeWriter_Init(&writer);
8987 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008988 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008989
Victor Stinner872b2912014-04-05 14:27:07 +02008990 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8991
Victor Stinner33798672016-03-01 21:59:58 +01008992 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008993 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008994 if (PyUnicode_IS_ASCII(input)) {
8995 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8996 if (res < 0) {
8997 _PyUnicodeWriter_Dealloc(&writer);
8998 return NULL;
8999 }
9000 if (res == 1)
9001 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009002 }
Victor Stinner33798672016-03-01 21:59:58 +01009003 else {
9004 i = 0;
9005 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009006
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009007 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009008 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009009 int translate;
9010 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9011 Py_ssize_t newpos;
9012 /* startpos for collecting untranslatable chars */
9013 Py_ssize_t collstart;
9014 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009015 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009016
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 ch = PyUnicode_READ(kind, data, i);
9018 translate = charmaptranslate_output(ch, mapping, &writer);
9019 if (translate < 0)
9020 goto onError;
9021
9022 if (translate != 0) {
9023 /* it worked => adjust input pointer */
9024 ++i;
9025 continue;
9026 }
9027
9028 /* untranslatable character */
9029 collstart = i;
9030 collend = i+1;
9031
9032 /* find all untranslatable characters */
9033 while (collend < size) {
9034 PyObject *x;
9035 ch = PyUnicode_READ(kind, data, collend);
9036 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009037 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009038 Py_XDECREF(x);
9039 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009040 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009041 ++collend;
9042 }
9043
9044 if (ignore) {
9045 i = collend;
9046 }
9047 else {
9048 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9049 reason, input, &exc,
9050 collstart, collend, &newpos);
9051 if (repunicode == NULL)
9052 goto onError;
9053 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009054 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009055 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009056 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009057 Py_DECREF(repunicode);
9058 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009059 }
9060 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009061 Py_XDECREF(exc);
9062 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009063 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009064
Benjamin Peterson29060642009-01-31 22:14:21 +00009065 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009066 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009067 Py_XDECREF(exc);
9068 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009069 return NULL;
9070}
9071
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072/* Deprecated. Use PyUnicode_Translate instead. */
9073PyObject *
9074PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9075 Py_ssize_t size,
9076 PyObject *mapping,
9077 const char *errors)
9078{
Christian Heimes5f520f42012-09-11 14:03:25 +02009079 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009080 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009081 if (!unicode)
9082 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009083 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9084 Py_DECREF(unicode);
9085 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009086}
9087
Alexander Belopolsky40018472011-02-26 01:02:56 +00009088PyObject *
9089PyUnicode_Translate(PyObject *str,
9090 PyObject *mapping,
9091 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009093 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009094 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009095 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096}
Tim Petersced69f82003-09-16 20:30:58 +00009097
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009098PyObject *
9099_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9100{
9101 if (!PyUnicode_Check(unicode)) {
9102 PyErr_BadInternalCall();
9103 return NULL;
9104 }
9105 if (PyUnicode_READY(unicode) == -1)
9106 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009107 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009108 /* If the string is already ASCII, just return the same string */
9109 Py_INCREF(unicode);
9110 return unicode;
9111 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009112
9113 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9114 PyObject *result = PyUnicode_New(len, 127);
9115 if (result == NULL) {
9116 return NULL;
9117 }
9118
9119 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9120 int kind = PyUnicode_KIND(unicode);
9121 const void *data = PyUnicode_DATA(unicode);
9122 Py_ssize_t i;
9123 for (i = 0; i < len; ++i) {
9124 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9125 if (ch < 127) {
9126 out[i] = ch;
9127 }
9128 else if (Py_UNICODE_ISSPACE(ch)) {
9129 out[i] = ' ';
9130 }
9131 else {
9132 int decimal = Py_UNICODE_TODECIMAL(ch);
9133 if (decimal < 0) {
9134 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009135 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009136 _PyUnicode_LENGTH(result) = i + 1;
9137 break;
9138 }
9139 out[i] = '0' + decimal;
9140 }
9141 }
9142
INADA Naoki16dfca42018-07-14 12:06:43 +09009143 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009144 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009145}
9146
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009147PyObject *
9148PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9149 Py_ssize_t length)
9150{
Victor Stinnerf0124502011-11-21 23:12:56 +01009151 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009152 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009153 Py_UCS4 maxchar;
9154 enum PyUnicode_Kind kind;
9155 void *data;
9156
Victor Stinner99d7ad02012-02-22 13:37:39 +01009157 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009158 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009159 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009160 if (ch > 127) {
9161 int decimal = Py_UNICODE_TODECIMAL(ch);
9162 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009163 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009164 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009165 }
9166 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009167
9168 /* Copy to a new string */
9169 decimal = PyUnicode_New(length, maxchar);
9170 if (decimal == NULL)
9171 return decimal;
9172 kind = PyUnicode_KIND(decimal);
9173 data = PyUnicode_DATA(decimal);
9174 /* Iterate over code points */
9175 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009176 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009177 if (ch > 127) {
9178 int decimal = Py_UNICODE_TODECIMAL(ch);
9179 if (decimal >= 0)
9180 ch = '0' + decimal;
9181 }
9182 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009183 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009184 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009185}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009186/* --- Decimal Encoder ---------------------------------------------------- */
9187
Alexander Belopolsky40018472011-02-26 01:02:56 +00009188int
9189PyUnicode_EncodeDecimal(Py_UNICODE *s,
9190 Py_ssize_t length,
9191 char *output,
9192 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009193{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009194 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009195 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009196 enum PyUnicode_Kind kind;
9197 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009198
9199 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009200 PyErr_BadArgument();
9201 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009202 }
9203
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009204 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009205 if (unicode == NULL)
9206 return -1;
9207
Victor Stinner42bf7752011-11-21 22:52:58 +01009208 kind = PyUnicode_KIND(unicode);
9209 data = PyUnicode_DATA(unicode);
9210
Victor Stinnerb84d7232011-11-22 01:50:07 +01009211 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009212 PyObject *exc;
9213 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009214 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009215 Py_ssize_t startpos;
9216
9217 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009218
Benjamin Peterson29060642009-01-31 22:14:21 +00009219 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009220 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009221 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009222 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009223 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 decimal = Py_UNICODE_TODECIMAL(ch);
9225 if (decimal >= 0) {
9226 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009227 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 continue;
9229 }
9230 if (0 < ch && ch < 256) {
9231 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009232 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009233 continue;
9234 }
Victor Stinner6345be92011-11-25 20:09:01 +01009235
Victor Stinner42bf7752011-11-21 22:52:58 +01009236 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009237 exc = NULL;
9238 raise_encode_exception(&exc, "decimal", unicode,
9239 startpos, startpos+1,
9240 "invalid decimal Unicode string");
9241 Py_XDECREF(exc);
9242 Py_DECREF(unicode);
9243 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009244 }
9245 /* 0-terminate the output string */
9246 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009247 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009248 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009249}
9250
Guido van Rossumd57fd912000-03-10 22:53:23 +00009251/* --- Helpers ------------------------------------------------------------ */
9252
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.  `start` is not clamped
   against `len` or `end`, so callers must still handle start > end.

   `start` and `end` must be modifiable lvalues.  `len` may be evaluated more
   than once, so it must not have side effects.  The body is wrapped in
   do { } while (0) so the macro behaves as a single statement (safe in an
   unbraced if/else); callers terminate it with a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9267
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009268static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009269any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009271 Py_ssize_t end,
9272 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009273{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009274 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275 void *buf1, *buf2;
9276 Py_ssize_t len1, len2, result;
9277
9278 kind1 = PyUnicode_KIND(s1);
9279 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009280 if (kind1 < kind2)
9281 return -1;
9282
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009283 len1 = PyUnicode_GET_LENGTH(s1);
9284 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009285 ADJUST_INDICES(start, end, len1);
9286 if (end - start < len2)
9287 return -1;
9288
9289 buf1 = PyUnicode_DATA(s1);
9290 buf2 = PyUnicode_DATA(s2);
9291 if (len2 == 1) {
9292 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9293 result = findchar((const char *)buf1 + kind1*start,
9294 kind1, end - start, ch, direction);
9295 if (result == -1)
9296 return -1;
9297 else
9298 return start + result;
9299 }
9300
9301 if (kind2 != kind1) {
9302 buf2 = _PyUnicode_AsKind(s2, kind1);
9303 if (!buf2)
9304 return -2;
9305 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009306
Victor Stinner794d5672011-10-10 03:21:36 +02009307 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009308 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009309 case PyUnicode_1BYTE_KIND:
9310 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9311 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9312 else
9313 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9314 break;
9315 case PyUnicode_2BYTE_KIND:
9316 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9317 break;
9318 case PyUnicode_4BYTE_KIND:
9319 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9320 break;
9321 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009322 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009323 }
9324 }
9325 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009326 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009327 case PyUnicode_1BYTE_KIND:
9328 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9329 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9330 else
9331 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9332 break;
9333 case PyUnicode_2BYTE_KIND:
9334 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9335 break;
9336 case PyUnicode_4BYTE_KIND:
9337 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9338 break;
9339 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009340 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009341 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009342 }
9343
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009344 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009345 PyMem_Free(buf2);
9346
9347 return result;
9348}
9349
Victor Stinner59423e32018-11-26 13:40:01 +01009350/* _PyUnicode_InsertThousandsGrouping() helper functions */
9351#include "stringlib/localeutil.h"
9352
9353/**
9354 * InsertThousandsGrouping:
9355 * @writer: Unicode writer.
9356 * @n_buffer: Number of characters in @buffer.
9357 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9358 * @d_pos: Start of digits string.
9359 * @n_digits: The number of digits in the string, in which we want
9360 * to put the grouping chars.
9361 * @min_width: The minimum width of the digits in the output string.
9362 * Output will be zero-padded on the left to fill.
9363 * @grouping: see definition in localeconv().
9364 * @thousands_sep: see definition in localeconv().
9365 *
9366 * There are 2 modes: counting and filling. If @writer is NULL,
9367 * we are in counting mode, else filling mode.
9368 * If counting, the required buffer size is returned.
9369 * If filling, we know the buffer will be large enough, so we don't
9370 * need to pass in the buffer size.
9371 * Inserts thousand grouping characters (as defined by grouping and
9372 * thousands_sep) into @writer.
9373 *
9374 * Return value: -1 on error, number of characters otherwise.
9375 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009376Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009377_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009378 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009379 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009380 PyObject *digits,
9381 Py_ssize_t d_pos,
9382 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009384 const char *grouping,
9385 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009386 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387{
Xtreak3f7983a2019-01-07 20:39:14 +05309388 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009389 if (writer) {
9390 assert(digits != NULL);
9391 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009392 }
9393 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009394 assert(digits == NULL);
9395 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 }
Victor Stinner59423e32018-11-26 13:40:01 +01009397 assert(0 <= d_pos);
9398 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009399 assert(grouping != NULL);
9400
9401 if (digits != NULL) {
9402 if (PyUnicode_READY(digits) == -1) {
9403 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009404 }
Victor Stinner59423e32018-11-26 13:40:01 +01009405 }
9406 if (PyUnicode_READY(thousands_sep) == -1) {
9407 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009408 }
9409
Victor Stinner59423e32018-11-26 13:40:01 +01009410 Py_ssize_t count = 0;
9411 Py_ssize_t n_zeros;
9412 int loop_broken = 0;
9413 int use_separator = 0; /* First time through, don't append the
9414 separator. They only go between
9415 groups. */
9416 Py_ssize_t buffer_pos;
9417 Py_ssize_t digits_pos;
9418 Py_ssize_t len;
9419 Py_ssize_t n_chars;
9420 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9421 be looked at */
9422 /* A generator that returns all of the grouping widths, until it
9423 returns 0. */
9424 GroupGenerator groupgen;
9425 GroupGenerator_init(&groupgen, grouping);
9426 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9427
9428 /* if digits are not grouped, thousands separator
9429 should be an empty string */
9430 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9431
9432 digits_pos = d_pos + n_digits;
9433 if (writer) {
9434 buffer_pos = writer->pos + n_buffer;
9435 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9436 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009437 }
Victor Stinner59423e32018-11-26 13:40:01 +01009438 else {
9439 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009440 }
Victor Stinner59423e32018-11-26 13:40:01 +01009441
9442 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009443 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009444 }
Victor Stinner59423e32018-11-26 13:40:01 +01009445
9446 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9447 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9448 n_zeros = Py_MAX(0, len - remaining);
9449 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9450
9451 /* Use n_zero zero's and n_chars chars */
9452
9453 /* Count only, don't do anything. */
9454 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9455
9456 /* Copy into the writer. */
9457 InsertThousandsGrouping_fill(writer, &buffer_pos,
9458 digits, &digits_pos,
9459 n_chars, n_zeros,
9460 use_separator ? thousands_sep : NULL,
9461 thousands_sep_len, maxchar);
9462
9463 /* Use a separator next time. */
9464 use_separator = 1;
9465
9466 remaining -= n_chars;
9467 min_width -= len;
9468
9469 if (remaining <= 0 && min_width <= 0) {
9470 loop_broken = 1;
9471 break;
9472 }
9473 min_width -= thousands_sep_len;
9474 }
9475 if (!loop_broken) {
9476 /* We left the loop without using a break statement. */
9477
9478 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9479 n_zeros = Py_MAX(0, len - remaining);
9480 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9481
9482 /* Use n_zero zero's and n_chars chars */
9483 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9484
9485 /* Copy into the writer. */
9486 InsertThousandsGrouping_fill(writer, &buffer_pos,
9487 digits, &digits_pos,
9488 n_chars, n_zeros,
9489 use_separator ? thousands_sep : NULL,
9490 thousands_sep_len, maxchar);
9491 }
9492 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009493}
9494
9495
Alexander Belopolsky40018472011-02-26 01:02:56 +00009496Py_ssize_t
9497PyUnicode_Count(PyObject *str,
9498 PyObject *substr,
9499 Py_ssize_t start,
9500 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009501{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009502 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009503 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009504 void *buf1 = NULL, *buf2 = NULL;
9505 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009506
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009507 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009508 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009509
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009510 kind1 = PyUnicode_KIND(str);
9511 kind2 = PyUnicode_KIND(substr);
9512 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009513 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009514
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009515 len1 = PyUnicode_GET_LENGTH(str);
9516 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009517 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009518 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009519 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009520
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009521 buf1 = PyUnicode_DATA(str);
9522 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009523 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009524 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009525 if (!buf2)
9526 goto onError;
9527 }
9528
9529 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009530 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009531 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009532 result = asciilib_count(
9533 ((Py_UCS1*)buf1) + start, end - start,
9534 buf2, len2, PY_SSIZE_T_MAX
9535 );
9536 else
9537 result = ucs1lib_count(
9538 ((Py_UCS1*)buf1) + start, end - start,
9539 buf2, len2, PY_SSIZE_T_MAX
9540 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009541 break;
9542 case PyUnicode_2BYTE_KIND:
9543 result = ucs2lib_count(
9544 ((Py_UCS2*)buf1) + start, end - start,
9545 buf2, len2, PY_SSIZE_T_MAX
9546 );
9547 break;
9548 case PyUnicode_4BYTE_KIND:
9549 result = ucs4lib_count(
9550 ((Py_UCS4*)buf1) + start, end - start,
9551 buf2, len2, PY_SSIZE_T_MAX
9552 );
9553 break;
9554 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009555 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009557
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009558 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009559 PyMem_Free(buf2);
9560
Guido van Rossumd57fd912000-03-10 22:53:23 +00009561 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009562 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009563 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 PyMem_Free(buf2);
9565 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009566}
9567
Alexander Belopolsky40018472011-02-26 01:02:56 +00009568Py_ssize_t
9569PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009570 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009571 Py_ssize_t start,
9572 Py_ssize_t end,
9573 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009574{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009575 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009576 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009577
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009578 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009579}
9580
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009581Py_ssize_t
9582PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9583 Py_ssize_t start, Py_ssize_t end,
9584 int direction)
9585{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009586 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009587 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 if (PyUnicode_READY(str) == -1)
9589 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009590 len = PyUnicode_GET_LENGTH(str);
9591 ADJUST_INDICES(start, end, len);
9592 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009593 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009594 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009595 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9596 kind, end-start, ch, direction);
9597 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009599 else
9600 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009601}
9602
Alexander Belopolsky40018472011-02-26 01:02:56 +00009603static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009604tailmatch(PyObject *self,
9605 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009606 Py_ssize_t start,
9607 Py_ssize_t end,
9608 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009609{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009610 int kind_self;
9611 int kind_sub;
9612 void *data_self;
9613 void *data_sub;
9614 Py_ssize_t offset;
9615 Py_ssize_t i;
9616 Py_ssize_t end_sub;
9617
9618 if (PyUnicode_READY(self) == -1 ||
9619 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009620 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009621
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009622 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9623 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009624 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009625 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009627 if (PyUnicode_GET_LENGTH(substring) == 0)
9628 return 1;
9629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009630 kind_self = PyUnicode_KIND(self);
9631 data_self = PyUnicode_DATA(self);
9632 kind_sub = PyUnicode_KIND(substring);
9633 data_sub = PyUnicode_DATA(substring);
9634 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9635
9636 if (direction > 0)
9637 offset = end;
9638 else
9639 offset = start;
9640
9641 if (PyUnicode_READ(kind_self, data_self, offset) ==
9642 PyUnicode_READ(kind_sub, data_sub, 0) &&
9643 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9644 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9645 /* If both are of the same kind, memcmp is sufficient */
9646 if (kind_self == kind_sub) {
9647 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009648 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 data_sub,
9650 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009651 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009652 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009653 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 else {
9655 /* We do not need to compare 0 and len(substring)-1 because
9656 the if statement above ensured already that they are equal
9657 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 for (i = 1; i < end_sub; ++i) {
9659 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9660 PyUnicode_READ(kind_sub, data_sub, i))
9661 return 0;
9662 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009663 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009664 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009665 }
9666
9667 return 0;
9668}
9669
Alexander Belopolsky40018472011-02-26 01:02:56 +00009670Py_ssize_t
9671PyUnicode_Tailmatch(PyObject *str,
9672 PyObject *substr,
9673 Py_ssize_t start,
9674 Py_ssize_t end,
9675 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009676{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009677 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009678 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009679
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009680 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009681}
9682
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009683static PyObject *
9684ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009686 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9687 char *resdata, *data = PyUnicode_DATA(self);
9688 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009689
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 res = PyUnicode_New(len, 127);
9691 if (res == NULL)
9692 return NULL;
9693 resdata = PyUnicode_DATA(res);
9694 if (lower)
9695 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009696 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009697 _Py_bytes_upper(resdata, data, len);
9698 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009699}
9700
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009701static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009702handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704 Py_ssize_t j;
9705 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009706 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009707 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009708
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9710
9711 where ! is a negation and \p{xxx} is a character with property xxx.
9712 */
9713 for (j = i - 1; j >= 0; j--) {
9714 c = PyUnicode_READ(kind, data, j);
9715 if (!_PyUnicode_IsCaseIgnorable(c))
9716 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009717 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009718 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9719 if (final_sigma) {
9720 for (j = i + 1; j < length; j++) {
9721 c = PyUnicode_READ(kind, data, j);
9722 if (!_PyUnicode_IsCaseIgnorable(c))
9723 break;
9724 }
9725 final_sigma = j == length || !_PyUnicode_IsCased(c);
9726 }
9727 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728}
9729
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009730static int
9731lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9732 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009733{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734 /* Obscure special case. */
9735 if (c == 0x3A3) {
9736 mapped[0] = handle_capital_sigma(kind, data, length, i);
9737 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009738 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009739 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740}
9741
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009742static Py_ssize_t
9743do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009745 Py_ssize_t i, k = 0;
9746 int n_res, j;
9747 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009748
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009750 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009752 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009753 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009754 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 for (i = 1; i < length; i++) {
9756 c = PyUnicode_READ(kind, data, i);
9757 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9758 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009759 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009760 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009761 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009762 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009763 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009764}
9765
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009766static Py_ssize_t
9767do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9768 Py_ssize_t i, k = 0;
9769
9770 for (i = 0; i < length; i++) {
9771 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9772 int n_res, j;
9773 if (Py_UNICODE_ISUPPER(c)) {
9774 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9775 }
9776 else if (Py_UNICODE_ISLOWER(c)) {
9777 n_res = _PyUnicode_ToUpperFull(c, mapped);
9778 }
9779 else {
9780 n_res = 1;
9781 mapped[0] = c;
9782 }
9783 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009784 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009785 res[k++] = mapped[j];
9786 }
9787 }
9788 return k;
9789}
9790
9791static Py_ssize_t
9792do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9793 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009794{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009795 Py_ssize_t i, k = 0;
9796
9797 for (i = 0; i < length; i++) {
9798 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9799 int n_res, j;
9800 if (lower)
9801 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9802 else
9803 n_res = _PyUnicode_ToUpperFull(c, mapped);
9804 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009805 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009806 res[k++] = mapped[j];
9807 }
9808 }
9809 return k;
9810}
9811
9812static Py_ssize_t
9813do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9814{
9815 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9816}
9817
9818static Py_ssize_t
9819do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9820{
9821 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9822}
9823
Benjamin Petersone51757f2012-01-12 21:10:29 -05009824static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009825do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9826{
9827 Py_ssize_t i, k = 0;
9828
9829 for (i = 0; i < length; i++) {
9830 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9831 Py_UCS4 mapped[3];
9832 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9833 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009834 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009835 res[k++] = mapped[j];
9836 }
9837 }
9838 return k;
9839}
9840
9841static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009842do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9843{
9844 Py_ssize_t i, k = 0;
9845 int previous_is_cased;
9846
9847 previous_is_cased = 0;
9848 for (i = 0; i < length; i++) {
9849 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9850 Py_UCS4 mapped[3];
9851 int n_res, j;
9852
9853 if (previous_is_cased)
9854 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9855 else
9856 n_res = _PyUnicode_ToTitleFull(c, mapped);
9857
9858 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009859 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009860 res[k++] = mapped[j];
9861 }
9862
9863 previous_is_cased = _PyUnicode_IsCased(c);
9864 }
9865 return k;
9866}
9867
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868static PyObject *
9869case_operation(PyObject *self,
9870 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9871{
9872 PyObject *res = NULL;
9873 Py_ssize_t length, newlength = 0;
9874 int kind, outkind;
9875 void *data, *outdata;
9876 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9877
Benjamin Petersoneea48462012-01-16 14:28:50 -05009878 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879
9880 kind = PyUnicode_KIND(self);
9881 data = PyUnicode_DATA(self);
9882 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009883 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009884 PyErr_SetString(PyExc_OverflowError, "string is too long");
9885 return NULL;
9886 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009887 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009888 if (tmp == NULL)
9889 return PyErr_NoMemory();
9890 newlength = perform(kind, data, length, tmp, &maxchar);
9891 res = PyUnicode_New(newlength, maxchar);
9892 if (res == NULL)
9893 goto leave;
9894 tmpend = tmp + newlength;
9895 outdata = PyUnicode_DATA(res);
9896 outkind = PyUnicode_KIND(res);
9897 switch (outkind) {
9898 case PyUnicode_1BYTE_KIND:
9899 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9900 break;
9901 case PyUnicode_2BYTE_KIND:
9902 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9903 break;
9904 case PyUnicode_4BYTE_KIND:
9905 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9906 break;
9907 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009908 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009909 }
9910 leave:
9911 PyMem_FREE(tmp);
9912 return res;
9913}
9914
Tim Peters8ce9f162004-08-27 01:49:32 +00009915PyObject *
9916PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009917{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009918 PyObject *res;
9919 PyObject *fseq;
9920 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009921 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009922
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009923 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009924 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009925 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009926 }
9927
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009928 /* NOTE: the following code can't call back into Python code,
9929 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009930 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009931
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009932 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009933 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009934 res = _PyUnicode_JoinArray(separator, items, seqlen);
9935 Py_DECREF(fseq);
9936 return res;
9937}
9938
9939PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009940_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009941{
9942 PyObject *res = NULL; /* the result */
9943 PyObject *sep = NULL;
9944 Py_ssize_t seplen;
9945 PyObject *item;
9946 Py_ssize_t sz, i, res_offset;
9947 Py_UCS4 maxchar;
9948 Py_UCS4 item_maxchar;
9949 int use_memcpy;
9950 unsigned char *res_data = NULL, *sep_data = NULL;
9951 PyObject *last_obj;
9952 unsigned int kind = 0;
9953
Tim Peters05eba1f2004-08-27 21:32:02 +00009954 /* If empty sequence, return u"". */
9955 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009956 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009957 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009958
Tim Peters05eba1f2004-08-27 21:32:02 +00009959 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009960 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009961 if (seqlen == 1) {
9962 if (PyUnicode_CheckExact(items[0])) {
9963 res = items[0];
9964 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009965 return res;
9966 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009967 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009968 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009969 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009970 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009971 /* Set up sep and seplen */
9972 if (separator == NULL) {
9973 /* fall back to a blank space separator */
9974 sep = PyUnicode_FromOrdinal(' ');
9975 if (!sep)
9976 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009977 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009978 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009979 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009980 else {
9981 if (!PyUnicode_Check(separator)) {
9982 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009983 "separator: expected str instance,"
9984 " %.80s found",
9985 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009986 goto onError;
9987 }
9988 if (PyUnicode_READY(separator))
9989 goto onError;
9990 sep = separator;
9991 seplen = PyUnicode_GET_LENGTH(separator);
9992 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9993 /* inc refcount to keep this code path symmetric with the
9994 above case of a blank separator */
9995 Py_INCREF(sep);
9996 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009997 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +00009998 }
9999
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010000 /* There are at least two things to join, or else we have a subclass
10001 * of str in the sequence.
10002 * Do a pre-pass to figure out the total amount of space we'll
10003 * need (sz), and see whether all argument are strings.
10004 */
10005 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010006#ifdef Py_DEBUG
10007 use_memcpy = 0;
10008#else
10009 use_memcpy = 1;
10010#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010011 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010012 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010014 if (!PyUnicode_Check(item)) {
10015 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010016 "sequence item %zd: expected str instance,"
10017 " %.80s found",
10018 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010019 goto onError;
10020 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010021 if (PyUnicode_READY(item) == -1)
10022 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010023 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010024 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010025 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010026 if (i != 0) {
10027 add_sz += seplen;
10028 }
10029 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010030 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010031 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010032 goto onError;
10033 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010034 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010035 if (use_memcpy && last_obj != NULL) {
10036 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10037 use_memcpy = 0;
10038 }
10039 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010040 }
Tim Petersced69f82003-09-16 20:30:58 +000010041
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010042 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010043 if (res == NULL)
10044 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010045
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010046 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010047#ifdef Py_DEBUG
10048 use_memcpy = 0;
10049#else
10050 if (use_memcpy) {
10051 res_data = PyUnicode_1BYTE_DATA(res);
10052 kind = PyUnicode_KIND(res);
10053 if (seplen != 0)
10054 sep_data = PyUnicode_1BYTE_DATA(sep);
10055 }
10056#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010057 if (use_memcpy) {
10058 for (i = 0; i < seqlen; ++i) {
10059 Py_ssize_t itemlen;
10060 item = items[i];
10061
10062 /* Copy item, and maybe the separator. */
10063 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010064 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010065 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010066 kind * seplen);
10067 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010068 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010069
10070 itemlen = PyUnicode_GET_LENGTH(item);
10071 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010072 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010073 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010074 kind * itemlen);
10075 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010076 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010077 }
10078 assert(res_data == PyUnicode_1BYTE_DATA(res)
10079 + kind * PyUnicode_GET_LENGTH(res));
10080 }
10081 else {
10082 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10083 Py_ssize_t itemlen;
10084 item = items[i];
10085
10086 /* Copy item, and maybe the separator. */
10087 if (i && seplen != 0) {
10088 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10089 res_offset += seplen;
10090 }
10091
10092 itemlen = PyUnicode_GET_LENGTH(item);
10093 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010094 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010095 res_offset += itemlen;
10096 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010097 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010098 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010099 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010100
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010101 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010102 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010104
Benjamin Peterson29060642009-01-31 22:14:21 +000010105 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010106 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010107 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108 return NULL;
10109}
10110
Victor Stinnerd3f08822012-05-29 12:57:52 +020010111void
10112_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10113 Py_UCS4 fill_char)
10114{
10115 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010116 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010117 assert(PyUnicode_IS_READY(unicode));
10118 assert(unicode_modifiable(unicode));
10119 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10120 assert(start >= 0);
10121 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010122 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010123}
10124
Victor Stinner3fe55312012-01-04 00:33:50 +010010125Py_ssize_t
10126PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10127 Py_UCS4 fill_char)
10128{
10129 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010130
10131 if (!PyUnicode_Check(unicode)) {
10132 PyErr_BadInternalCall();
10133 return -1;
10134 }
10135 if (PyUnicode_READY(unicode) == -1)
10136 return -1;
10137 if (unicode_check_modifiable(unicode))
10138 return -1;
10139
Victor Stinnerd3f08822012-05-29 12:57:52 +020010140 if (start < 0) {
10141 PyErr_SetString(PyExc_IndexError, "string index out of range");
10142 return -1;
10143 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010144 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10145 PyErr_SetString(PyExc_ValueError,
10146 "fill character is bigger than "
10147 "the string maximum character");
10148 return -1;
10149 }
10150
10151 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10152 length = Py_MIN(maxlen, length);
10153 if (length <= 0)
10154 return 0;
10155
Victor Stinnerd3f08822012-05-29 12:57:52 +020010156 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010157 return length;
10158}
10159
Victor Stinner9310abb2011-10-05 00:59:23 +020010160static PyObject *
10161pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010162 Py_ssize_t left,
10163 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010164 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010165{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 PyObject *u;
10167 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010168 int kind;
10169 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010170
10171 if (left < 0)
10172 left = 0;
10173 if (right < 0)
10174 right = 0;
10175
Victor Stinnerc4b49542011-12-11 22:44:26 +010010176 if (left == 0 && right == 0)
10177 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010178
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010179 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10180 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010181 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10182 return NULL;
10183 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010184 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010185 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010187 if (!u)
10188 return NULL;
10189
10190 kind = PyUnicode_KIND(u);
10191 data = PyUnicode_DATA(u);
10192 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010193 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010194 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010195 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010196 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010197 assert(_PyUnicode_CheckConsistency(u, 1));
10198 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010199}
10200
Alexander Belopolsky40018472011-02-26 01:02:56 +000010201PyObject *
10202PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010204 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010206 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208
Benjamin Petersonead6b532011-12-20 17:23:42 -060010209 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010210 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010211 if (PyUnicode_IS_ASCII(string))
10212 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010213 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010214 PyUnicode_GET_LENGTH(string), keepends);
10215 else
10216 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010219 break;
10220 case PyUnicode_2BYTE_KIND:
10221 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010222 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 PyUnicode_GET_LENGTH(string), keepends);
10224 break;
10225 case PyUnicode_4BYTE_KIND:
10226 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010227 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010228 PyUnicode_GET_LENGTH(string), keepends);
10229 break;
10230 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010231 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010233 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010234}
10235
Alexander Belopolsky40018472011-02-26 01:02:56 +000010236static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010237split(PyObject *self,
10238 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010239 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010240{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010241 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010242 void *buf1, *buf2;
10243 Py_ssize_t len1, len2;
10244 PyObject* out;
10245
Guido van Rossumd57fd912000-03-10 22:53:23 +000010246 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010247 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010249 if (PyUnicode_READY(self) == -1)
10250 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010251
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010252 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010253 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010255 if (PyUnicode_IS_ASCII(self))
10256 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010257 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010258 PyUnicode_GET_LENGTH(self), maxcount
10259 );
10260 else
10261 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010262 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010263 PyUnicode_GET_LENGTH(self), maxcount
10264 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010265 case PyUnicode_2BYTE_KIND:
10266 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010267 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010268 PyUnicode_GET_LENGTH(self), maxcount
10269 );
10270 case PyUnicode_4BYTE_KIND:
10271 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010272 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010273 PyUnicode_GET_LENGTH(self), maxcount
10274 );
10275 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010276 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 }
10278
10279 if (PyUnicode_READY(substring) == -1)
10280 return NULL;
10281
10282 kind1 = PyUnicode_KIND(self);
10283 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010284 len1 = PyUnicode_GET_LENGTH(self);
10285 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010286 if (kind1 < kind2 || len1 < len2) {
10287 out = PyList_New(1);
10288 if (out == NULL)
10289 return NULL;
10290 Py_INCREF(self);
10291 PyList_SET_ITEM(out, 0, self);
10292 return out;
10293 }
10294 buf1 = PyUnicode_DATA(self);
10295 buf2 = PyUnicode_DATA(substring);
10296 if (kind2 != kind1) {
10297 buf2 = _PyUnicode_AsKind(substring, kind1);
10298 if (!buf2)
10299 return NULL;
10300 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010301
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010302 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010304 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10305 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010306 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010307 else
10308 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010309 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010310 break;
10311 case PyUnicode_2BYTE_KIND:
10312 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 break;
10315 case PyUnicode_4BYTE_KIND:
10316 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010317 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 break;
10319 default:
10320 out = NULL;
10321 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010322 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010323 PyMem_Free(buf2);
10324 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010325}
10326
Alexander Belopolsky40018472011-02-26 01:02:56 +000010327static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010328rsplit(PyObject *self,
10329 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010330 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010331{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010332 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010333 void *buf1, *buf2;
10334 Py_ssize_t len1, len2;
10335 PyObject* out;
10336
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010337 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010338 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010339
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010340 if (PyUnicode_READY(self) == -1)
10341 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010342
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010343 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010344 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010346 if (PyUnicode_IS_ASCII(self))
10347 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010348 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010349 PyUnicode_GET_LENGTH(self), maxcount
10350 );
10351 else
10352 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010353 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010354 PyUnicode_GET_LENGTH(self), maxcount
10355 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010356 case PyUnicode_2BYTE_KIND:
10357 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010358 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010359 PyUnicode_GET_LENGTH(self), maxcount
10360 );
10361 case PyUnicode_4BYTE_KIND:
10362 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010363 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010364 PyUnicode_GET_LENGTH(self), maxcount
10365 );
10366 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010367 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 }
10369
10370 if (PyUnicode_READY(substring) == -1)
10371 return NULL;
10372
10373 kind1 = PyUnicode_KIND(self);
10374 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010375 len1 = PyUnicode_GET_LENGTH(self);
10376 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010377 if (kind1 < kind2 || len1 < len2) {
10378 out = PyList_New(1);
10379 if (out == NULL)
10380 return NULL;
10381 Py_INCREF(self);
10382 PyList_SET_ITEM(out, 0, self);
10383 return out;
10384 }
10385 buf1 = PyUnicode_DATA(self);
10386 buf2 = PyUnicode_DATA(substring);
10387 if (kind2 != kind1) {
10388 buf2 = _PyUnicode_AsKind(substring, kind1);
10389 if (!buf2)
10390 return NULL;
10391 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010393 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010395 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10396 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010397 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010398 else
10399 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010400 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010401 break;
10402 case PyUnicode_2BYTE_KIND:
10403 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 case PyUnicode_4BYTE_KIND:
10407 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010408 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 break;
10410 default:
10411 out = NULL;
10412 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010413 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010414 PyMem_Free(buf2);
10415 return out;
10416}
10417
10418static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010419anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10420 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010421{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010422 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010424 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10425 return asciilib_find(buf1, len1, buf2, len2, offset);
10426 else
10427 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010428 case PyUnicode_2BYTE_KIND:
10429 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10430 case PyUnicode_4BYTE_KIND:
10431 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10432 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010433 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010434}
10435
10436static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010437anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10438 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010439{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010440 switch (kind) {
10441 case PyUnicode_1BYTE_KIND:
10442 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10443 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10444 else
10445 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10446 case PyUnicode_2BYTE_KIND:
10447 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10448 case PyUnicode_4BYTE_KIND:
10449 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10450 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010451 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010452}
10453
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010454static void
10455replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10456 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10457{
10458 int kind = PyUnicode_KIND(u);
10459 void *data = PyUnicode_DATA(u);
10460 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10461 if (kind == PyUnicode_1BYTE_KIND) {
10462 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10463 (Py_UCS1 *)data + len,
10464 u1, u2, maxcount);
10465 }
10466 else if (kind == PyUnicode_2BYTE_KIND) {
10467 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10468 (Py_UCS2 *)data + len,
10469 u1, u2, maxcount);
10470 }
10471 else {
10472 assert(kind == PyUnicode_4BYTE_KIND);
10473 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10474 (Py_UCS4 *)data + len,
10475 u1, u2, maxcount);
10476 }
10477}
10478
Alexander Belopolsky40018472011-02-26 01:02:56 +000010479static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010480replace(PyObject *self, PyObject *str1,
10481 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010482{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 PyObject *u;
10484 char *sbuf = PyUnicode_DATA(self);
10485 char *buf1 = PyUnicode_DATA(str1);
10486 char *buf2 = PyUnicode_DATA(str2);
10487 int srelease = 0, release1 = 0, release2 = 0;
10488 int skind = PyUnicode_KIND(self);
10489 int kind1 = PyUnicode_KIND(str1);
10490 int kind2 = PyUnicode_KIND(str2);
10491 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10492 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10493 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010494 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010495 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010496
10497 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010498 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010499 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010500 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010501
Victor Stinner59de0ee2011-10-07 10:01:28 +020010502 if (str1 == str2)
10503 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010504
Victor Stinner49a0a212011-10-12 23:46:10 +020010505 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010506 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10507 if (maxchar < maxchar_str1)
10508 /* substring too wide to be present */
10509 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010510 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10511 /* Replacing str1 with str2 may cause a maxchar reduction in the
10512 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010513 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010514 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010515
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010516 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010517 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010519 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010522 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010523 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010524
Victor Stinner69ed0f42013-04-09 21:48:24 +020010525 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010526 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010527 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010528 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010529 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010530 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010531 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010533
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010534 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10535 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010536 }
10537 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010538 int rkind = skind;
10539 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010540 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010541
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 if (kind1 < rkind) {
10543 /* widen substring */
10544 buf1 = _PyUnicode_AsKind(str1, rkind);
10545 if (!buf1) goto error;
10546 release1 = 1;
10547 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010548 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010549 if (i < 0)
10550 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010551 if (rkind > kind2) {
10552 /* widen replacement */
10553 buf2 = _PyUnicode_AsKind(str2, rkind);
10554 if (!buf2) goto error;
10555 release2 = 1;
10556 }
10557 else if (rkind < kind2) {
10558 /* widen self and buf1 */
10559 rkind = kind2;
10560 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010561 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010562 sbuf = _PyUnicode_AsKind(self, rkind);
10563 if (!sbuf) goto error;
10564 srelease = 1;
10565 buf1 = _PyUnicode_AsKind(str1, rkind);
10566 if (!buf1) goto error;
10567 release1 = 1;
10568 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010569 u = PyUnicode_New(slen, maxchar);
10570 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010571 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010572 assert(PyUnicode_KIND(u) == rkind);
10573 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010574
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010575 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010576 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010577 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010578 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010581
10582 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010583 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010584 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010585 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010586 if (i == -1)
10587 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010589 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010590 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010592 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010593 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010594 }
10595 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010596 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010597 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 int rkind = skind;
10599 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010600
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010601 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010602 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 buf1 = _PyUnicode_AsKind(str1, rkind);
10604 if (!buf1) goto error;
10605 release1 = 1;
10606 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010607 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010608 if (n == 0)
10609 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010610 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010611 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 buf2 = _PyUnicode_AsKind(str2, rkind);
10613 if (!buf2) goto error;
10614 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010617 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 rkind = kind2;
10619 sbuf = _PyUnicode_AsKind(self, rkind);
10620 if (!sbuf) goto error;
10621 srelease = 1;
10622 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010623 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010624 buf1 = _PyUnicode_AsKind(str1, rkind);
10625 if (!buf1) goto error;
10626 release1 = 1;
10627 }
10628 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10629 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010630 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010631 PyErr_SetString(PyExc_OverflowError,
10632 "replace string is too long");
10633 goto error;
10634 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010635 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010636 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010637 _Py_INCREF_UNICODE_EMPTY();
10638 if (!unicode_empty)
10639 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 u = unicode_empty;
10641 goto done;
10642 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010643 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010644 PyErr_SetString(PyExc_OverflowError,
10645 "replace string is too long");
10646 goto error;
10647 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010648 u = PyUnicode_New(new_size, maxchar);
10649 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010650 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010651 assert(PyUnicode_KIND(u) == rkind);
10652 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010653 ires = i = 0;
10654 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010655 while (n-- > 0) {
10656 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010657 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010658 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010659 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010660 if (j == -1)
10661 break;
10662 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010663 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010664 memcpy(res + rkind * ires,
10665 sbuf + rkind * i,
10666 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010668 }
10669 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010670 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010671 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010675 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010680 memcpy(res + rkind * ires,
10681 sbuf + rkind * i,
10682 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010683 }
10684 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010685 /* interleave */
10686 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010687 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010689 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010691 if (--n <= 0)
10692 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010693 memcpy(res + rkind * ires,
10694 sbuf + rkind * i,
10695 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 ires++;
10697 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010698 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010699 memcpy(res + rkind * ires,
10700 sbuf + rkind * i,
10701 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010703 }
10704
10705 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010706 unicode_adjust_maxchar(&u);
10707 if (u == NULL)
10708 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010709 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010710
10711 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010712 if (srelease)
10713 PyMem_FREE(sbuf);
10714 if (release1)
10715 PyMem_FREE(buf1);
10716 if (release2)
10717 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010718 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010719 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010720
Benjamin Peterson29060642009-01-31 22:14:21 +000010721 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 if (srelease)
10724 PyMem_FREE(sbuf);
10725 if (release1)
10726 PyMem_FREE(buf1);
10727 if (release2)
10728 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010729 return unicode_result_unchanged(self);
10730
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010731 error:
10732 if (srelease && sbuf)
10733 PyMem_FREE(sbuf);
10734 if (release1 && buf1)
10735 PyMem_FREE(buf1);
10736 if (release2 && buf2)
10737 PyMem_FREE(buf2);
10738 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010739}
10740
10741/* --- Unicode Object Methods --------------------------------------------- */
10742
INADA Naoki3ae20562017-01-16 20:41:20 +090010743/*[clinic input]
10744str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010745
INADA Naoki3ae20562017-01-16 20:41:20 +090010746Return a version of the string where each word is titlecased.
10747
10748More specifically, words start with uppercased characters and all remaining
10749cased characters have lower case.
10750[clinic start generated code]*/
10751
10752static PyObject *
10753unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010754/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010755{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010756 if (PyUnicode_READY(self) == -1)
10757 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010758 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759}
10760
INADA Naoki3ae20562017-01-16 20:41:20 +090010761/*[clinic input]
10762str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763
INADA Naoki3ae20562017-01-16 20:41:20 +090010764Return a capitalized version of the string.
10765
10766More specifically, make the first character have upper case and the rest lower
10767case.
10768[clinic start generated code]*/
10769
10770static PyObject *
10771unicode_capitalize_impl(PyObject *self)
10772/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010773{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010774 if (PyUnicode_READY(self) == -1)
10775 return NULL;
10776 if (PyUnicode_GET_LENGTH(self) == 0)
10777 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010778 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010779}
10780
INADA Naoki3ae20562017-01-16 20:41:20 +090010781/*[clinic input]
10782str.casefold as unicode_casefold
10783
10784Return a version of the string suitable for caseless comparisons.
10785[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010786
10787static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010788unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010789/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010790{
10791 if (PyUnicode_READY(self) == -1)
10792 return NULL;
10793 if (PyUnicode_IS_ASCII(self))
10794 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010795 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010796}
10797
10798
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010799/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010800
10801static int
10802convert_uc(PyObject *obj, void *addr)
10803{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010805
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010806 if (!PyUnicode_Check(obj)) {
10807 PyErr_Format(PyExc_TypeError,
10808 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010809 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010810 return 0;
10811 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010812 if (PyUnicode_READY(obj) < 0)
10813 return 0;
10814 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010815 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010816 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 return 0;
10818 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010819 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010820 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010821}
10822
INADA Naoki3ae20562017-01-16 20:41:20 +090010823/*[clinic input]
10824str.center as unicode_center
10825
10826 width: Py_ssize_t
10827 fillchar: Py_UCS4 = ' '
10828 /
10829
10830Return a centered string of length width.
10831
10832Padding is done using the specified fill character (default is a space).
10833[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010834
10835static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010836unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10837/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010839 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840
Benjamin Petersonbac79492012-01-14 13:34:47 -050010841 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842 return NULL;
10843
Victor Stinnerc4b49542011-12-11 22:44:26 +010010844 if (PyUnicode_GET_LENGTH(self) >= width)
10845 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846
Victor Stinnerc4b49542011-12-11 22:44:26 +010010847 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848 left = marg / 2 + (marg & width & 1);
10849
Victor Stinner9310abb2011-10-05 00:59:23 +020010850 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010851}
10852
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010853/* This function assumes that str1 and str2 are readied by the caller. */
10854
Marc-André Lemburge5034372000-08-08 08:04:29 +000010855static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010856unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010857{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010858#define COMPARE(TYPE1, TYPE2) \
10859 do { \
10860 TYPE1* p1 = (TYPE1 *)data1; \
10861 TYPE2* p2 = (TYPE2 *)data2; \
10862 TYPE1* end = p1 + len; \
10863 Py_UCS4 c1, c2; \
10864 for (; p1 != end; p1++, p2++) { \
10865 c1 = *p1; \
10866 c2 = *p2; \
10867 if (c1 != c2) \
10868 return (c1 < c2) ? -1 : 1; \
10869 } \
10870 } \
10871 while (0)
10872
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010873 int kind1, kind2;
10874 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010875 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 kind1 = PyUnicode_KIND(str1);
10878 kind2 = PyUnicode_KIND(str2);
10879 data1 = PyUnicode_DATA(str1);
10880 data2 = PyUnicode_DATA(str2);
10881 len1 = PyUnicode_GET_LENGTH(str1);
10882 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010883 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010884
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010885 switch(kind1) {
10886 case PyUnicode_1BYTE_KIND:
10887 {
10888 switch(kind2) {
10889 case PyUnicode_1BYTE_KIND:
10890 {
10891 int cmp = memcmp(data1, data2, len);
10892 /* normalize result of memcmp() into the range [-1; 1] */
10893 if (cmp < 0)
10894 return -1;
10895 if (cmp > 0)
10896 return 1;
10897 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010898 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010899 case PyUnicode_2BYTE_KIND:
10900 COMPARE(Py_UCS1, Py_UCS2);
10901 break;
10902 case PyUnicode_4BYTE_KIND:
10903 COMPARE(Py_UCS1, Py_UCS4);
10904 break;
10905 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010906 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010907 }
10908 break;
10909 }
10910 case PyUnicode_2BYTE_KIND:
10911 {
10912 switch(kind2) {
10913 case PyUnicode_1BYTE_KIND:
10914 COMPARE(Py_UCS2, Py_UCS1);
10915 break;
10916 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010917 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010918 COMPARE(Py_UCS2, Py_UCS2);
10919 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010920 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010921 case PyUnicode_4BYTE_KIND:
10922 COMPARE(Py_UCS2, Py_UCS4);
10923 break;
10924 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010925 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010926 }
10927 break;
10928 }
10929 case PyUnicode_4BYTE_KIND:
10930 {
10931 switch(kind2) {
10932 case PyUnicode_1BYTE_KIND:
10933 COMPARE(Py_UCS4, Py_UCS1);
10934 break;
10935 case PyUnicode_2BYTE_KIND:
10936 COMPARE(Py_UCS4, Py_UCS2);
10937 break;
10938 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010939 {
10940#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10941 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10942 /* normalize result of wmemcmp() into the range [-1; 1] */
10943 if (cmp < 0)
10944 return -1;
10945 if (cmp > 0)
10946 return 1;
10947#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010948 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010949#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010951 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010953 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 }
10955 break;
10956 }
10957 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010958 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010959 }
10960
Victor Stinner770e19e2012-10-04 22:59:45 +020010961 if (len1 == len2)
10962 return 0;
10963 if (len1 < len2)
10964 return -1;
10965 else
10966 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010967
10968#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010969}
10970
Benjamin Peterson621b4302016-09-09 13:54:34 -070010971static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010972unicode_compare_eq(PyObject *str1, PyObject *str2)
10973{
10974 int kind;
10975 void *data1, *data2;
10976 Py_ssize_t len;
10977 int cmp;
10978
Victor Stinnere5567ad2012-10-23 02:48:49 +020010979 len = PyUnicode_GET_LENGTH(str1);
10980 if (PyUnicode_GET_LENGTH(str2) != len)
10981 return 0;
10982 kind = PyUnicode_KIND(str1);
10983 if (PyUnicode_KIND(str2) != kind)
10984 return 0;
10985 data1 = PyUnicode_DATA(str1);
10986 data2 = PyUnicode_DATA(str2);
10987
10988 cmp = memcmp(data1, data2, len * kind);
10989 return (cmp == 0);
10990}
10991
10992
Alexander Belopolsky40018472011-02-26 01:02:56 +000010993int
10994PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010995{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010996 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10997 if (PyUnicode_READY(left) == -1 ||
10998 PyUnicode_READY(right) == -1)
10999 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011000
11001 /* a string is equal to itself */
11002 if (left == right)
11003 return 0;
11004
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011005 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011006 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011007 PyErr_Format(PyExc_TypeError,
11008 "Can't compare %.100s and %.100s",
11009 left->ob_type->tp_name,
11010 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011011 return -1;
11012}
11013
Martin v. Löwis5b222132007-06-10 09:51:05 +000011014int
11015PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011017 Py_ssize_t i;
11018 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011020 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021
Victor Stinner910337b2011-10-03 03:20:16 +020011022 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011023 if (!PyUnicode_IS_READY(uni)) {
11024 const wchar_t *ws = _PyUnicode_WSTR(uni);
11025 /* Compare Unicode string and source character set string */
11026 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11027 if (chr != ustr[i])
11028 return (chr < ustr[i]) ? -1 : 1;
11029 }
11030 /* This check keeps Python strings that end in '\0' from comparing equal
11031 to C strings identical up to that point. */
11032 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11033 return 1; /* uni is longer */
11034 if (ustr[i])
11035 return -1; /* str is longer */
11036 return 0;
11037 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011038 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011039 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011040 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011041 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011042 size_t len, len2 = strlen(str);
11043 int cmp;
11044
11045 len = Py_MIN(len1, len2);
11046 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011047 if (cmp != 0) {
11048 if (cmp < 0)
11049 return -1;
11050 else
11051 return 1;
11052 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011053 if (len1 > len2)
11054 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011055 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011056 return -1; /* str is longer */
11057 return 0;
11058 }
11059 else {
11060 void *data = PyUnicode_DATA(uni);
11061 /* Compare Unicode string and source character set string */
11062 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011063 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011064 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11065 /* This check keeps Python strings that end in '\0' from comparing equal
11066 to C strings identical up to that point. */
11067 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11068 return 1; /* uni is longer */
11069 if (str[i])
11070 return -1; /* str is longer */
11071 return 0;
11072 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011073}
11074
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011075static int
11076non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11077{
11078 size_t i, len;
11079 const wchar_t *p;
11080 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11081 if (strlen(str) != len)
11082 return 0;
11083 p = _PyUnicode_WSTR(unicode);
11084 assert(p);
11085 for (i = 0; i < len; i++) {
11086 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011087 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011088 return 0;
11089 }
11090 return 1;
11091}
11092
11093int
11094_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11095{
11096 size_t len;
11097 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011098 assert(str);
11099#ifndef NDEBUG
11100 for (const char *p = str; *p; p++) {
11101 assert((unsigned char)*p < 128);
11102 }
11103#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011104 if (PyUnicode_READY(unicode) == -1) {
11105 /* Memory error or bad data */
11106 PyErr_Clear();
11107 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11108 }
11109 if (!PyUnicode_IS_ASCII(unicode))
11110 return 0;
11111 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11112 return strlen(str) == len &&
11113 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11114}
11115
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011116int
11117_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11118{
11119 PyObject *right_uni;
11120 Py_hash_t hash;
11121
11122 assert(_PyUnicode_CHECK(left));
11123 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011124#ifndef NDEBUG
11125 for (const char *p = right->string; *p; p++) {
11126 assert((unsigned char)*p < 128);
11127 }
11128#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011129
11130 if (PyUnicode_READY(left) == -1) {
11131 /* memory error or bad data */
11132 PyErr_Clear();
11133 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11134 }
11135
11136 if (!PyUnicode_IS_ASCII(left))
11137 return 0;
11138
11139 right_uni = _PyUnicode_FromId(right); /* borrowed */
11140 if (right_uni == NULL) {
11141 /* memory error or bad data */
11142 PyErr_Clear();
11143 return _PyUnicode_EqualToASCIIString(left, right->string);
11144 }
11145
11146 if (left == right_uni)
11147 return 1;
11148
11149 if (PyUnicode_CHECK_INTERNED(left))
11150 return 0;
11151
INADA Naoki7cc95f52018-01-28 02:07:09 +090011152 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011153 hash = _PyUnicode_HASH(left);
11154 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11155 return 0;
11156
11157 return unicode_compare_eq(left, right_uni);
11158}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011159
Alexander Belopolsky40018472011-02-26 01:02:56 +000011160PyObject *
11161PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011162{
11163 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011164
Victor Stinnere5567ad2012-10-23 02:48:49 +020011165 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11166 Py_RETURN_NOTIMPLEMENTED;
11167
11168 if (PyUnicode_READY(left) == -1 ||
11169 PyUnicode_READY(right) == -1)
11170 return NULL;
11171
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011172 if (left == right) {
11173 switch (op) {
11174 case Py_EQ:
11175 case Py_LE:
11176 case Py_GE:
11177 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011178 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011179 case Py_NE:
11180 case Py_LT:
11181 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011182 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011183 default:
11184 PyErr_BadArgument();
11185 return NULL;
11186 }
11187 }
11188 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011189 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011190 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011191 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011192 }
11193 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011194 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011195 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011196 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011197}
11198
Alexander Belopolsky40018472011-02-26 01:02:56 +000011199int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011200_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11201{
11202 return unicode_eq(aa, bb);
11203}
11204
11205int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011206PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011207{
Victor Stinner77282cb2013-04-14 19:22:47 +020011208 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011209 void *buf1, *buf2;
11210 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011211 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011212
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011213 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011214 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011215 "'in <string>' requires string as left operand, not %.100s",
11216 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011217 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011218 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011219 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011220 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 if (ensure_unicode(str) < 0)
11222 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011223
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011224 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 kind2 = PyUnicode_KIND(substr);
11226 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011227 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 len2 = PyUnicode_GET_LENGTH(substr);
11230 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011231 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011232 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011233 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 if (len2 == 1) {
11235 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11236 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011237 return result;
11238 }
11239 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011240 buf2 = _PyUnicode_AsKind(substr, kind1);
11241 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011242 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011243 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011244
Victor Stinner77282cb2013-04-14 19:22:47 +020011245 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246 case PyUnicode_1BYTE_KIND:
11247 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11248 break;
11249 case PyUnicode_2BYTE_KIND:
11250 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11251 break;
11252 case PyUnicode_4BYTE_KIND:
11253 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11254 break;
11255 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011256 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011257 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011258
Victor Stinner77282cb2013-04-14 19:22:47 +020011259 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011260 PyMem_Free(buf2);
11261
Guido van Rossum403d68b2000-03-13 15:55:09 +000011262 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011263}
11264
Guido van Rossumd57fd912000-03-10 22:53:23 +000011265/* Concat to string or Unicode object giving a new Unicode object. */
11266
Alexander Belopolsky40018472011-02-26 01:02:56 +000011267PyObject *
11268PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011270 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011271 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011272 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011274 if (ensure_unicode(left) < 0)
11275 return NULL;
11276
11277 if (!PyUnicode_Check(right)) {
11278 PyErr_Format(PyExc_TypeError,
11279 "can only concatenate str (not \"%.200s\") to str",
11280 right->ob_type->tp_name);
11281 return NULL;
11282 }
11283 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011284 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011285
11286 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011287 if (left == unicode_empty)
11288 return PyUnicode_FromObject(right);
11289 if (right == unicode_empty)
11290 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011291
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011292 left_len = PyUnicode_GET_LENGTH(left);
11293 right_len = PyUnicode_GET_LENGTH(right);
11294 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011295 PyErr_SetString(PyExc_OverflowError,
11296 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011297 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011298 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011300
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11302 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011303 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011304
Guido van Rossumd57fd912000-03-10 22:53:23 +000011305 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011306 result = PyUnicode_New(new_len, maxchar);
11307 if (result == NULL)
11308 return NULL;
11309 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11310 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11311 assert(_PyUnicode_CheckConsistency(result, 1));
11312 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011313}
11314
Walter Dörwald1ab83302007-05-18 17:15:44 +000011315void
Victor Stinner23e56682011-10-03 03:54:37 +020011316PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011317{
Victor Stinner23e56682011-10-03 03:54:37 +020011318 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011319 Py_UCS4 maxchar, maxchar2;
11320 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011321
11322 if (p_left == NULL) {
11323 if (!PyErr_Occurred())
11324 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011325 return;
11326 }
Victor Stinner23e56682011-10-03 03:54:37 +020011327 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011328 if (right == NULL || left == NULL
11329 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011330 if (!PyErr_Occurred())
11331 PyErr_BadInternalCall();
11332 goto error;
11333 }
11334
Benjamin Petersonbac79492012-01-14 13:34:47 -050011335 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011336 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011337 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011338 goto error;
11339
Victor Stinner488fa492011-12-12 00:01:39 +010011340 /* Shortcuts */
11341 if (left == unicode_empty) {
11342 Py_DECREF(left);
11343 Py_INCREF(right);
11344 *p_left = right;
11345 return;
11346 }
11347 if (right == unicode_empty)
11348 return;
11349
11350 left_len = PyUnicode_GET_LENGTH(left);
11351 right_len = PyUnicode_GET_LENGTH(right);
11352 if (left_len > PY_SSIZE_T_MAX - right_len) {
11353 PyErr_SetString(PyExc_OverflowError,
11354 "strings are too large to concat");
11355 goto error;
11356 }
11357 new_len = left_len + right_len;
11358
11359 if (unicode_modifiable(left)
11360 && PyUnicode_CheckExact(right)
11361 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011362 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11363 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011364 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011365 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011366 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11367 {
11368 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011369 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011370 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011371
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011372 /* copy 'right' into the newly allocated area of 'left' */
11373 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011374 }
Victor Stinner488fa492011-12-12 00:01:39 +010011375 else {
11376 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11377 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011378 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011379
Victor Stinner488fa492011-12-12 00:01:39 +010011380 /* Concat the two Unicode strings */
11381 res = PyUnicode_New(new_len, maxchar);
11382 if (res == NULL)
11383 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011384 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11385 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011386 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011387 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011388 }
11389 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011390 return;
11391
11392error:
Victor Stinner488fa492011-12-12 00:01:39 +010011393 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011394}
11395
11396void
11397PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11398{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011399 PyUnicode_Append(pleft, right);
11400 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011401}
11402
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011403/*
11404Wraps stringlib_parse_args_finds() and additionally ensures that the
11405first argument is a unicode object.
11406*/
11407
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011408static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011409parse_args_finds_unicode(const char * function_name, PyObject *args,
11410 PyObject **substring,
11411 Py_ssize_t *start, Py_ssize_t *end)
11412{
11413 if(stringlib_parse_args_finds(function_name, args, substring,
11414 start, end)) {
11415 if (ensure_unicode(*substring) < 0)
11416 return 0;
11417 return 1;
11418 }
11419 return 0;
11420}
11421
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011422PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011423 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011424\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011425Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011426string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011427interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428
11429static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011430unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011431{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011432 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011433 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011434 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011436 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011437 void *buf1, *buf2;
11438 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011440 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011441 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011443 kind1 = PyUnicode_KIND(self);
11444 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011445 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011446 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011448 len1 = PyUnicode_GET_LENGTH(self);
11449 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011451 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011452 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011454 buf1 = PyUnicode_DATA(self);
11455 buf2 = PyUnicode_DATA(substring);
11456 if (kind2 != kind1) {
11457 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011458 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011459 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011460 }
11461 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011462 case PyUnicode_1BYTE_KIND:
11463 iresult = ucs1lib_count(
11464 ((Py_UCS1*)buf1) + start, end - start,
11465 buf2, len2, PY_SSIZE_T_MAX
11466 );
11467 break;
11468 case PyUnicode_2BYTE_KIND:
11469 iresult = ucs2lib_count(
11470 ((Py_UCS2*)buf1) + start, end - start,
11471 buf2, len2, PY_SSIZE_T_MAX
11472 );
11473 break;
11474 case PyUnicode_4BYTE_KIND:
11475 iresult = ucs4lib_count(
11476 ((Py_UCS4*)buf1) + start, end - start,
11477 buf2, len2, PY_SSIZE_T_MAX
11478 );
11479 break;
11480 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011481 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011482 }
11483
11484 result = PyLong_FromSsize_t(iresult);
11485
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011486 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011487 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011488
Guido van Rossumd57fd912000-03-10 22:53:23 +000011489 return result;
11490}
11491
INADA Naoki3ae20562017-01-16 20:41:20 +090011492/*[clinic input]
11493str.encode as unicode_encode
11494
11495 encoding: str(c_default="NULL") = 'utf-8'
11496 The encoding in which to encode the string.
11497 errors: str(c_default="NULL") = 'strict'
11498 The error handling scheme to use for encoding errors.
11499 The default is 'strict' meaning that encoding errors raise a
11500 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11501 'xmlcharrefreplace' as well as any other name registered with
11502 codecs.register_error that can handle UnicodeEncodeErrors.
11503
11504Encode the string using the codec registered for encoding.
11505[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011506
11507static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011508unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011509/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011511 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011512}
11513
INADA Naoki3ae20562017-01-16 20:41:20 +090011514/*[clinic input]
11515str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011516
INADA Naoki3ae20562017-01-16 20:41:20 +090011517 tabsize: int = 8
11518
11519Return a copy where all tab characters are expanded using spaces.
11520
11521If tabsize is not given, a tab size of 8 characters is assumed.
11522[clinic start generated code]*/
11523
11524static PyObject *
11525unicode_expandtabs_impl(PyObject *self, int tabsize)
11526/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011527{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011528 Py_ssize_t i, j, line_pos, src_len, incr;
11529 Py_UCS4 ch;
11530 PyObject *u;
11531 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011532 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011533 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011534
Antoine Pitrou22425222011-10-04 19:10:51 +020011535 if (PyUnicode_READY(self) == -1)
11536 return NULL;
11537
Thomas Wouters7e474022000-07-16 12:04:32 +000011538 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011539 src_len = PyUnicode_GET_LENGTH(self);
11540 i = j = line_pos = 0;
11541 kind = PyUnicode_KIND(self);
11542 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011543 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011544 for (; i < src_len; i++) {
11545 ch = PyUnicode_READ(kind, src_data, i);
11546 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011547 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011548 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011549 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 goto overflow;
11552 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011553 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011554 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011558 goto overflow;
11559 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011561 if (ch == '\n' || ch == '\r')
11562 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011564 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011565 if (!found)
11566 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011567
Guido van Rossumd57fd912000-03-10 22:53:23 +000011568 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011569 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 if (!u)
11571 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011572 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011573
Antoine Pitroue71d5742011-10-04 15:55:09 +020011574 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 for (; i < src_len; i++) {
11577 ch = PyUnicode_READ(kind, src_data, i);
11578 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011579 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 incr = tabsize - (line_pos % tabsize);
11581 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011582 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011583 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011584 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011585 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011587 line_pos++;
11588 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011589 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011590 if (ch == '\n' || ch == '\r')
11591 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011592 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011593 }
11594 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011595 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011596
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011598 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11599 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011600}
11601
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011602PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011603 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604\n\
11605Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011606such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011607arguments start and end are interpreted as in slice notation.\n\
11608\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011609Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011610
11611static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011612unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011613{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011614 /* initialize variables to prevent gcc warning */
11615 PyObject *substring = NULL;
11616 Py_ssize_t start = 0;
11617 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011618 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011619
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011620 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011622
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011623 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011624 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011625
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011626 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011627
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 if (result == -2)
11629 return NULL;
11630
Christian Heimes217cfd12007-12-02 14:31:20 +000011631 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011632}
11633
11634static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011635unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011637 void *data;
11638 enum PyUnicode_Kind kind;
11639 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011640
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011641 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011642 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011643 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011645 if (PyUnicode_READY(self) == -1) {
11646 return NULL;
11647 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011648 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11649 PyErr_SetString(PyExc_IndexError, "string index out of range");
11650 return NULL;
11651 }
11652 kind = PyUnicode_KIND(self);
11653 data = PyUnicode_DATA(self);
11654 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011655 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011656}
11657
Guido van Rossumc2504932007-09-18 19:42:40 +000011658/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011659 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011660static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011661unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011663 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011664
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011665#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011666 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011667#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011668 if (_PyUnicode_HASH(self) != -1)
11669 return _PyUnicode_HASH(self);
11670 if (PyUnicode_READY(self) == -1)
11671 return -1;
animalizea1d14252019-01-02 20:16:06 +080011672
Christian Heimes985ecdc2013-11-20 11:46:18 +010011673 x = _Py_HashBytes(PyUnicode_DATA(self),
11674 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011675 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011676 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011677}
11678
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011679PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011680 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681\n\
oldkaa0735f2018-02-02 16:52:55 +080011682Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011683such that sub is contained within S[start:end]. Optional\n\
11684arguments start and end are interpreted as in slice notation.\n\
11685\n\
11686Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011687
11688static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011689unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011690{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011691 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011692 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011693 PyObject *substring = NULL;
11694 Py_ssize_t start = 0;
11695 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011697 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011699
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011700 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011701 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011702
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011703 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 if (result == -2)
11706 return NULL;
11707
Guido van Rossumd57fd912000-03-10 22:53:23 +000011708 if (result < 0) {
11709 PyErr_SetString(PyExc_ValueError, "substring not found");
11710 return NULL;
11711 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011712
Christian Heimes217cfd12007-12-02 14:31:20 +000011713 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011714}
11715
INADA Naoki3ae20562017-01-16 20:41:20 +090011716/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011717str.isascii as unicode_isascii
11718
11719Return True if all characters in the string are ASCII, False otherwise.
11720
11721ASCII characters have code points in the range U+0000-U+007F.
11722Empty string is ASCII too.
11723[clinic start generated code]*/
11724
11725static PyObject *
11726unicode_isascii_impl(PyObject *self)
11727/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11728{
11729 if (PyUnicode_READY(self) == -1) {
11730 return NULL;
11731 }
11732 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11733}
11734
11735/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011736str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011737
INADA Naoki3ae20562017-01-16 20:41:20 +090011738Return True if the string is a lowercase string, False otherwise.
11739
11740A string is lowercase if all cased characters in the string are lowercase and
11741there is at least one cased character in the string.
11742[clinic start generated code]*/
11743
11744static PyObject *
11745unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011746/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011748 Py_ssize_t i, length;
11749 int kind;
11750 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751 int cased;
11752
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753 if (PyUnicode_READY(self) == -1)
11754 return NULL;
11755 length = PyUnicode_GET_LENGTH(self);
11756 kind = PyUnicode_KIND(self);
11757 data = PyUnicode_DATA(self);
11758
Guido van Rossumd57fd912000-03-10 22:53:23 +000011759 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011760 if (length == 1)
11761 return PyBool_FromLong(
11762 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011764 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011765 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011766 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011767
Guido van Rossumd57fd912000-03-10 22:53:23 +000011768 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 for (i = 0; i < length; i++) {
11770 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011771
Benjamin Peterson29060642009-01-31 22:14:21 +000011772 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011773 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 else if (!cased && Py_UNICODE_ISLOWER(ch))
11775 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011776 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011777 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778}
11779
INADA Naoki3ae20562017-01-16 20:41:20 +090011780/*[clinic input]
11781str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782
INADA Naoki3ae20562017-01-16 20:41:20 +090011783Return True if the string is an uppercase string, False otherwise.
11784
11785A string is uppercase if all cased characters in the string are uppercase and
11786there is at least one cased character in the string.
11787[clinic start generated code]*/
11788
11789static PyObject *
11790unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011791/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011792{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011793 Py_ssize_t i, length;
11794 int kind;
11795 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796 int cased;
11797
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011798 if (PyUnicode_READY(self) == -1)
11799 return NULL;
11800 length = PyUnicode_GET_LENGTH(self);
11801 kind = PyUnicode_KIND(self);
11802 data = PyUnicode_DATA(self);
11803
Guido van Rossumd57fd912000-03-10 22:53:23 +000011804 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011805 if (length == 1)
11806 return PyBool_FromLong(
11807 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011809 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011810 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011811 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011812
Guido van Rossumd57fd912000-03-10 22:53:23 +000011813 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 for (i = 0; i < length; i++) {
11815 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011816
Benjamin Peterson29060642009-01-31 22:14:21 +000011817 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011818 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 else if (!cased && Py_UNICODE_ISUPPER(ch))
11820 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011821 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011822 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823}
11824
INADA Naoki3ae20562017-01-16 20:41:20 +090011825/*[clinic input]
11826str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
INADA Naoki3ae20562017-01-16 20:41:20 +090011828Return True if the string is a title-cased string, False otherwise.
11829
11830In a title-cased string, upper- and title-case characters may only
11831follow uncased characters and lowercase characters only cased ones.
11832[clinic start generated code]*/
11833
11834static PyObject *
11835unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011836/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011837{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011838 Py_ssize_t i, length;
11839 int kind;
11840 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841 int cased, previous_is_cased;
11842
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011843 if (PyUnicode_READY(self) == -1)
11844 return NULL;
11845 length = PyUnicode_GET_LENGTH(self);
11846 kind = PyUnicode_KIND(self);
11847 data = PyUnicode_DATA(self);
11848
Guido van Rossumd57fd912000-03-10 22:53:23 +000011849 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011850 if (length == 1) {
11851 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11852 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11853 (Py_UNICODE_ISUPPER(ch) != 0));
11854 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011855
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011856 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011857 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011858 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011859
Guido van Rossumd57fd912000-03-10 22:53:23 +000011860 cased = 0;
11861 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011862 for (i = 0; i < length; i++) {
11863 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011864
Benjamin Peterson29060642009-01-31 22:14:21 +000011865 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11866 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011867 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011868 previous_is_cased = 1;
11869 cased = 1;
11870 }
11871 else if (Py_UNICODE_ISLOWER(ch)) {
11872 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011873 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011874 previous_is_cased = 1;
11875 cased = 1;
11876 }
11877 else
11878 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011880 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881}
11882
INADA Naoki3ae20562017-01-16 20:41:20 +090011883/*[clinic input]
11884str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885
INADA Naoki3ae20562017-01-16 20:41:20 +090011886Return True if the string is a whitespace string, False otherwise.
11887
11888A string is whitespace if all characters in the string are whitespace and there
11889is at least one character in the string.
11890[clinic start generated code]*/
11891
11892static PyObject *
11893unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011894/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011895{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011896 Py_ssize_t i, length;
11897 int kind;
11898 void *data;
11899
11900 if (PyUnicode_READY(self) == -1)
11901 return NULL;
11902 length = PyUnicode_GET_LENGTH(self);
11903 kind = PyUnicode_KIND(self);
11904 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011905
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011907 if (length == 1)
11908 return PyBool_FromLong(
11909 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011911 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011912 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011913 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011914
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011915 for (i = 0; i < length; i++) {
11916 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011917 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011918 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011919 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011920 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921}
11922
INADA Naoki3ae20562017-01-16 20:41:20 +090011923/*[clinic input]
11924str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011925
INADA Naoki3ae20562017-01-16 20:41:20 +090011926Return True if the string is an alphabetic string, False otherwise.
11927
11928A string is alphabetic if all characters in the string are alphabetic and there
11929is at least one character in the string.
11930[clinic start generated code]*/
11931
11932static PyObject *
11933unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011934/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011935{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011936 Py_ssize_t i, length;
11937 int kind;
11938 void *data;
11939
11940 if (PyUnicode_READY(self) == -1)
11941 return NULL;
11942 length = PyUnicode_GET_LENGTH(self);
11943 kind = PyUnicode_KIND(self);
11944 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011945
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011946 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011947 if (length == 1)
11948 return PyBool_FromLong(
11949 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011950
11951 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011952 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011953 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011954
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011955 for (i = 0; i < length; i++) {
11956 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011957 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011958 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011959 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960}
11961
INADA Naoki3ae20562017-01-16 20:41:20 +090011962/*[clinic input]
11963str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011964
INADA Naoki3ae20562017-01-16 20:41:20 +090011965Return True if the string is an alpha-numeric string, False otherwise.
11966
11967A string is alpha-numeric if all characters in the string are alpha-numeric and
11968there is at least one character in the string.
11969[clinic start generated code]*/
11970
11971static PyObject *
11972unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011973/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011974{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011975 int kind;
11976 void *data;
11977 Py_ssize_t len, i;
11978
11979 if (PyUnicode_READY(self) == -1)
11980 return NULL;
11981
11982 kind = PyUnicode_KIND(self);
11983 data = PyUnicode_DATA(self);
11984 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011985
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011986 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011987 if (len == 1) {
11988 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11989 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11990 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011991
11992 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011993 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011994 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011995
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011996 for (i = 0; i < len; i++) {
11997 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011998 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011999 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012000 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002}
12003
INADA Naoki3ae20562017-01-16 20:41:20 +090012004/*[clinic input]
12005str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012006
INADA Naoki3ae20562017-01-16 20:41:20 +090012007Return True if the string is a decimal string, False otherwise.
12008
12009A string is a decimal string if all characters in the string are decimal and
12010there is at least one character in the string.
12011[clinic start generated code]*/
12012
12013static PyObject *
12014unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012015/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012016{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012017 Py_ssize_t i, length;
12018 int kind;
12019 void *data;
12020
12021 if (PyUnicode_READY(self) == -1)
12022 return NULL;
12023 length = PyUnicode_GET_LENGTH(self);
12024 kind = PyUnicode_KIND(self);
12025 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012026
Guido van Rossumd57fd912000-03-10 22:53:23 +000012027 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012028 if (length == 1)
12029 return PyBool_FromLong(
12030 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012032 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012033 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012034 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012035
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012036 for (i = 0; i < length; i++) {
12037 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012038 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012039 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012040 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041}
12042
INADA Naoki3ae20562017-01-16 20:41:20 +090012043/*[clinic input]
12044str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045
INADA Naoki3ae20562017-01-16 20:41:20 +090012046Return True if the string is a digit string, False otherwise.
12047
12048A string is a digit string if all characters in the string are digits and there
12049is at least one character in the string.
12050[clinic start generated code]*/
12051
12052static PyObject *
12053unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012054/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012055{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012056 Py_ssize_t i, length;
12057 int kind;
12058 void *data;
12059
12060 if (PyUnicode_READY(self) == -1)
12061 return NULL;
12062 length = PyUnicode_GET_LENGTH(self);
12063 kind = PyUnicode_KIND(self);
12064 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012065
Guido van Rossumd57fd912000-03-10 22:53:23 +000012066 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012067 if (length == 1) {
12068 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12069 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12070 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012071
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012072 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012073 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012074 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012076 for (i = 0; i < length; i++) {
12077 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012078 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012079 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012080 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081}
12082
INADA Naoki3ae20562017-01-16 20:41:20 +090012083/*[clinic input]
12084str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085
INADA Naoki3ae20562017-01-16 20:41:20 +090012086Return True if the string is a numeric string, False otherwise.
12087
12088A string is numeric if all characters in the string are numeric and there is at
12089least one character in the string.
12090[clinic start generated code]*/
12091
12092static PyObject *
12093unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012094/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012095{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012096 Py_ssize_t i, length;
12097 int kind;
12098 void *data;
12099
12100 if (PyUnicode_READY(self) == -1)
12101 return NULL;
12102 length = PyUnicode_GET_LENGTH(self);
12103 kind = PyUnicode_KIND(self);
12104 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012105
Guido van Rossumd57fd912000-03-10 22:53:23 +000012106 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012107 if (length == 1)
12108 return PyBool_FromLong(
12109 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012111 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012112 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012113 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012114
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 for (i = 0; i < length; i++) {
12116 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012117 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012118 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012119 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120}
12121
Martin v. Löwis47383402007-08-15 07:32:56 +000012122int
12123PyUnicode_IsIdentifier(PyObject *self)
12124{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012125 int kind;
12126 void *data;
12127 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012128 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012129
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012130 if (PyUnicode_READY(self) == -1) {
12131 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012132 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012133 }
12134
12135 /* Special case for empty strings */
12136 if (PyUnicode_GET_LENGTH(self) == 0)
12137 return 0;
12138 kind = PyUnicode_KIND(self);
12139 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012140
12141 /* PEP 3131 says that the first character must be in
12142 XID_Start and subsequent characters in XID_Continue,
12143 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012144 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012145 letters, digits, underscore). However, given the current
12146 definition of XID_Start and XID_Continue, it is sufficient
12147 to check just for these, except that _ must be allowed
12148 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012149 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012150 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012151 return 0;
12152
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012153 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012154 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012155 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012156 return 1;
12157}
12158
INADA Naoki3ae20562017-01-16 20:41:20 +090012159/*[clinic input]
12160str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012161
INADA Naoki3ae20562017-01-16 20:41:20 +090012162Return True if the string is a valid Python identifier, False otherwise.
12163
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012164Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012165such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012166[clinic start generated code]*/
12167
12168static PyObject *
12169unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012170/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012171{
12172 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12173}
12174
INADA Naoki3ae20562017-01-16 20:41:20 +090012175/*[clinic input]
12176str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012177
INADA Naoki3ae20562017-01-16 20:41:20 +090012178Return True if the string is printable, False otherwise.
12179
12180A string is printable if all of its characters are considered printable in
12181repr() or if it is empty.
12182[clinic start generated code]*/
12183
12184static PyObject *
12185unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012186/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012187{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012188 Py_ssize_t i, length;
12189 int kind;
12190 void *data;
12191
12192 if (PyUnicode_READY(self) == -1)
12193 return NULL;
12194 length = PyUnicode_GET_LENGTH(self);
12195 kind = PyUnicode_KIND(self);
12196 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012197
12198 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012199 if (length == 1)
12200 return PyBool_FromLong(
12201 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012202
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 for (i = 0; i < length; i++) {
12204 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012205 Py_RETURN_FALSE;
12206 }
12207 }
12208 Py_RETURN_TRUE;
12209}
12210
INADA Naoki3ae20562017-01-16 20:41:20 +090012211/*[clinic input]
12212str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
INADA Naoki3ae20562017-01-16 20:41:20 +090012214 iterable: object
12215 /
12216
12217Concatenate any number of strings.
12218
Martin Panter91a88662017-01-24 00:30:06 +000012219The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012220The result is returned as a new string.
12221
12222Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12223[clinic start generated code]*/
12224
12225static PyObject *
12226unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012227/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012228{
INADA Naoki3ae20562017-01-16 20:41:20 +090012229 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230}
12231
Martin v. Löwis18e16552006-02-15 17:27:45 +000012232static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012233unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 if (PyUnicode_READY(self) == -1)
12236 return -1;
12237 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238}
12239
INADA Naoki3ae20562017-01-16 20:41:20 +090012240/*[clinic input]
12241str.ljust as unicode_ljust
12242
12243 width: Py_ssize_t
12244 fillchar: Py_UCS4 = ' '
12245 /
12246
12247Return a left-justified string of length width.
12248
12249Padding is done using the specified fill character (default is a space).
12250[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012251
12252static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012253unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12254/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012256 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012257 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012258
Victor Stinnerc4b49542011-12-11 22:44:26 +010012259 if (PyUnicode_GET_LENGTH(self) >= width)
12260 return unicode_result_unchanged(self);
12261
12262 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012263}
12264
INADA Naoki3ae20562017-01-16 20:41:20 +090012265/*[clinic input]
12266str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267
INADA Naoki3ae20562017-01-16 20:41:20 +090012268Return a copy of the string converted to lowercase.
12269[clinic start generated code]*/
12270
12271static PyObject *
12272unicode_lower_impl(PyObject *self)
12273/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012274{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012275 if (PyUnicode_READY(self) == -1)
12276 return NULL;
12277 if (PyUnicode_IS_ASCII(self))
12278 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012279 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012280}
12281
/* Strip modes; these values double as indices into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP] = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP] = "strip",
};

#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012290
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012291/* externally visible for str.strip(unicode) */
12292PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012293_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012294{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012295 void *data;
12296 int kind;
12297 Py_ssize_t i, j, len;
12298 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012299 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012301 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12302 return NULL;
12303
12304 kind = PyUnicode_KIND(self);
12305 data = PyUnicode_DATA(self);
12306 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012307 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012308 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12309 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012310 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012311
Benjamin Peterson14339b62009-01-31 16:36:08 +000012312 i = 0;
12313 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012314 while (i < len) {
12315 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12316 if (!BLOOM(sepmask, ch))
12317 break;
12318 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12319 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012320 i++;
12321 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012322 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012323
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 j = len;
12325 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012326 j--;
12327 while (j >= i) {
12328 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12329 if (!BLOOM(sepmask, ch))
12330 break;
12331 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12332 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012333 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012334 }
12335
Benjamin Peterson29060642009-01-31 22:14:21 +000012336 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012337 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012338
Victor Stinner7931d9a2011-11-04 00:22:48 +010012339 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012340}
12341
12342PyObject*
12343PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12344{
12345 unsigned char *data;
12346 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012347 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012348
Victor Stinnerde636f32011-10-01 03:55:54 +020012349 if (PyUnicode_READY(self) == -1)
12350 return NULL;
12351
Victor Stinner684d5fd2012-05-03 02:32:34 +020012352 length = PyUnicode_GET_LENGTH(self);
12353 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012354
Victor Stinner684d5fd2012-05-03 02:32:34 +020012355 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012356 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012357
Victor Stinnerde636f32011-10-01 03:55:54 +020012358 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012359 PyErr_SetString(PyExc_IndexError, "string index out of range");
12360 return NULL;
12361 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012362 if (start >= length || end < start)
12363 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012364
Victor Stinner684d5fd2012-05-03 02:32:34 +020012365 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012366 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012367 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012368 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012369 }
12370 else {
12371 kind = PyUnicode_KIND(self);
12372 data = PyUnicode_1BYTE_DATA(self);
12373 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012374 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012375 length);
12376 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012378
12379static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012380do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012381{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012382 Py_ssize_t len, i, j;
12383
12384 if (PyUnicode_READY(self) == -1)
12385 return NULL;
12386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012387 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012388
Victor Stinnercc7af722013-04-09 22:39:24 +020012389 if (PyUnicode_IS_ASCII(self)) {
12390 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12391
12392 i = 0;
12393 if (striptype != RIGHTSTRIP) {
12394 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012395 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012396 if (!_Py_ascii_whitespace[ch])
12397 break;
12398 i++;
12399 }
12400 }
12401
12402 j = len;
12403 if (striptype != LEFTSTRIP) {
12404 j--;
12405 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012406 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012407 if (!_Py_ascii_whitespace[ch])
12408 break;
12409 j--;
12410 }
12411 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012412 }
12413 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012414 else {
12415 int kind = PyUnicode_KIND(self);
12416 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012417
Victor Stinnercc7af722013-04-09 22:39:24 +020012418 i = 0;
12419 if (striptype != RIGHTSTRIP) {
12420 while (i < len) {
12421 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12422 if (!Py_UNICODE_ISSPACE(ch))
12423 break;
12424 i++;
12425 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012426 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012427
12428 j = len;
12429 if (striptype != LEFTSTRIP) {
12430 j--;
12431 while (j >= i) {
12432 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12433 if (!Py_UNICODE_ISSPACE(ch))
12434 break;
12435 j--;
12436 }
12437 j++;
12438 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012439 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012440
Victor Stinner7931d9a2011-11-04 00:22:48 +010012441 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012442}
12443
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012444
12445static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012446do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012447{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012448 if (sep != NULL && sep != Py_None) {
12449 if (PyUnicode_Check(sep))
12450 return _PyUnicode_XStrip(self, striptype, sep);
12451 else {
12452 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012453 "%s arg must be None or str",
12454 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012455 return NULL;
12456 }
12457 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012458
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460}
12461
12462
INADA Naoki3ae20562017-01-16 20:41:20 +090012463/*[clinic input]
12464str.strip as unicode_strip
12465
12466 chars: object = None
12467 /
12468
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012470
12471If chars is given and not None, remove characters in chars instead.
12472[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012473
12474static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012475unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012476/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477{
INADA Naoki3ae20562017-01-16 20:41:20 +090012478 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479}
12480
12481
INADA Naoki3ae20562017-01-16 20:41:20 +090012482/*[clinic input]
12483str.lstrip as unicode_lstrip
12484
12485 chars: object = NULL
12486 /
12487
12488Return a copy of the string with leading whitespace removed.
12489
12490If chars is given and not None, remove characters in chars instead.
12491[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012492
12493static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012494unicode_lstrip_impl(PyObject *self, PyObject *chars)
12495/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496{
INADA Naoki3ae20562017-01-16 20:41:20 +090012497 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498}
12499
12500
INADA Naoki3ae20562017-01-16 20:41:20 +090012501/*[clinic input]
12502str.rstrip as unicode_rstrip
12503
12504 chars: object = NULL
12505 /
12506
12507Return a copy of the string with trailing whitespace removed.
12508
12509If chars is given and not None, remove characters in chars instead.
12510[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012511
12512static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012513unicode_rstrip_impl(PyObject *self, PyObject *chars)
12514/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012515{
INADA Naoki3ae20562017-01-16 20:41:20 +090012516 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012517}
12518
12519
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012521unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012523 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012525
Serhiy Storchaka05997252013-01-26 12:14:02 +020012526 if (len < 1)
12527 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012528
Victor Stinnerc4b49542011-12-11 22:44:26 +010012529 /* no repeat, return original string */
12530 if (len == 1)
12531 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012532
Benjamin Petersonbac79492012-01-14 13:34:47 -050012533 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012534 return NULL;
12535
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012536 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012537 PyErr_SetString(PyExc_OverflowError,
12538 "repeated string is too long");
12539 return NULL;
12540 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012541 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012542
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012543 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012544 if (!u)
12545 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012546 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012547
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012548 if (PyUnicode_GET_LENGTH(str) == 1) {
12549 const int kind = PyUnicode_KIND(str);
12550 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012551 if (kind == PyUnicode_1BYTE_KIND) {
12552 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012553 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012554 }
12555 else if (kind == PyUnicode_2BYTE_KIND) {
12556 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012557 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012558 ucs2[n] = fill_char;
12559 } else {
12560 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12561 assert(kind == PyUnicode_4BYTE_KIND);
12562 for (n = 0; n < len; ++n)
12563 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012564 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012565 }
12566 else {
12567 /* number of characters copied this far */
12568 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012569 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012570 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012571 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012573 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012575 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012576 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012578 }
12579
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012580 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012581 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582}
12583
Alexander Belopolsky40018472011-02-26 01:02:56 +000012584PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012585PyUnicode_Replace(PyObject *str,
12586 PyObject *substr,
12587 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012588 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012589{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012590 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12591 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012592 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012593 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012594}
12595
INADA Naoki3ae20562017-01-16 20:41:20 +090012596/*[clinic input]
12597str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598
INADA Naoki3ae20562017-01-16 20:41:20 +090012599 old: unicode
12600 new: unicode
12601 count: Py_ssize_t = -1
12602 Maximum number of occurrences to replace.
12603 -1 (the default value) means replace all occurrences.
12604 /
12605
12606Return a copy with all occurrences of substring old replaced by new.
12607
12608If the optional argument count is given, only the first count occurrences are
12609replaced.
12610[clinic start generated code]*/
12611
12612static PyObject *
12613unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12614 Py_ssize_t count)
12615/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012616{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012617 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012618 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012619 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620}
12621
Alexander Belopolsky40018472011-02-26 01:02:56 +000012622static PyObject *
12623unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012625 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012626 Py_ssize_t isize;
12627 Py_ssize_t osize, squote, dquote, i, o;
12628 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012629 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012633 return NULL;
12634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012635 isize = PyUnicode_GET_LENGTH(unicode);
12636 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012638 /* Compute length of output, quote characters, and
12639 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012640 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012641 max = 127;
12642 squote = dquote = 0;
12643 ikind = PyUnicode_KIND(unicode);
12644 for (i = 0; i < isize; i++) {
12645 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012646 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012647 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012648 case '\'': squote++; break;
12649 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012650 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012651 incr = 2;
12652 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012653 default:
12654 /* Fast-path ASCII */
12655 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012656 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 ;
12659 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012660 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012664 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012666 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012668 if (osize > PY_SSIZE_T_MAX - incr) {
12669 PyErr_SetString(PyExc_OverflowError,
12670 "string is too long to generate repr");
12671 return NULL;
12672 }
12673 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012674 }
12675
12676 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012677 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012679 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 if (dquote)
12681 /* Both squote and dquote present. Use squote,
12682 and escape them */
12683 osize += squote;
12684 else
12685 quote = '"';
12686 }
Victor Stinner55c08782013-04-14 18:45:39 +020012687 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012688
12689 repr = PyUnicode_New(osize, max);
12690 if (repr == NULL)
12691 return NULL;
12692 okind = PyUnicode_KIND(repr);
12693 odata = PyUnicode_DATA(repr);
12694
12695 PyUnicode_WRITE(okind, odata, 0, quote);
12696 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012697 if (unchanged) {
12698 _PyUnicode_FastCopyCharacters(repr, 1,
12699 unicode, 0,
12700 isize);
12701 }
12702 else {
12703 for (i = 0, o = 1; i < isize; i++) {
12704 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012705
Victor Stinner55c08782013-04-14 18:45:39 +020012706 /* Escape quotes and backslashes */
12707 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012708 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012710 continue;
12711 }
12712
12713 /* Map special whitespace to '\t', \n', '\r' */
12714 if (ch == '\t') {
12715 PyUnicode_WRITE(okind, odata, o++, '\\');
12716 PyUnicode_WRITE(okind, odata, o++, 't');
12717 }
12718 else if (ch == '\n') {
12719 PyUnicode_WRITE(okind, odata, o++, '\\');
12720 PyUnicode_WRITE(okind, odata, o++, 'n');
12721 }
12722 else if (ch == '\r') {
12723 PyUnicode_WRITE(okind, odata, o++, '\\');
12724 PyUnicode_WRITE(okind, odata, o++, 'r');
12725 }
12726
12727 /* Map non-printable US ASCII to '\xhh' */
12728 else if (ch < ' ' || ch == 0x7F) {
12729 PyUnicode_WRITE(okind, odata, o++, '\\');
12730 PyUnicode_WRITE(okind, odata, o++, 'x');
12731 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12732 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12733 }
12734
12735 /* Copy ASCII characters as-is */
12736 else if (ch < 0x7F) {
12737 PyUnicode_WRITE(okind, odata, o++, ch);
12738 }
12739
12740 /* Non-ASCII characters */
12741 else {
12742 /* Map Unicode whitespace and control characters
12743 (categories Z* and C* except ASCII space)
12744 */
12745 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12746 PyUnicode_WRITE(okind, odata, o++, '\\');
12747 /* Map 8-bit characters to '\xhh' */
12748 if (ch <= 0xff) {
12749 PyUnicode_WRITE(okind, odata, o++, 'x');
12750 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12751 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12752 }
12753 /* Map 16-bit characters to '\uxxxx' */
12754 else if (ch <= 0xffff) {
12755 PyUnicode_WRITE(okind, odata, o++, 'u');
12756 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12757 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12760 }
12761 /* Map 21-bit characters to '\U00xxxxxx' */
12762 else {
12763 PyUnicode_WRITE(okind, odata, o++, 'U');
12764 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12765 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12772 }
12773 }
12774 /* Copy characters as-is */
12775 else {
12776 PyUnicode_WRITE(okind, odata, o++, ch);
12777 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012778 }
12779 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012780 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012781 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012782 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012783 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012784}
12785
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012786PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012787 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788\n\
12789Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012790such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012791arguments start and end are interpreted as in slice notation.\n\
12792\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012793Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012794
12795static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012796unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012797{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012798 /* initialize variables to prevent gcc warning */
12799 PyObject *substring = NULL;
12800 Py_ssize_t start = 0;
12801 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012802 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012803
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012804 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012805 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012806
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012807 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012808 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012810 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012811
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 if (result == -2)
12813 return NULL;
12814
Christian Heimes217cfd12007-12-02 14:31:20 +000012815 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012816}
12817
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012818PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012819 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012821Return the highest index in S where substring sub is found,\n\
12822such that sub is contained within S[start:end]. Optional\n\
12823arguments start and end are interpreted as in slice notation.\n\
12824\n\
12825Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012826
12827static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012828unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012829{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012830 /* initialize variables to prevent gcc warning */
12831 PyObject *substring = NULL;
12832 Py_ssize_t start = 0;
12833 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012834 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012835
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012836 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012837 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012838
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012839 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012840 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012841
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012842 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012843
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 if (result == -2)
12845 return NULL;
12846
Guido van Rossumd57fd912000-03-10 22:53:23 +000012847 if (result < 0) {
12848 PyErr_SetString(PyExc_ValueError, "substring not found");
12849 return NULL;
12850 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851
Christian Heimes217cfd12007-12-02 14:31:20 +000012852 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012853}
12854
INADA Naoki3ae20562017-01-16 20:41:20 +090012855/*[clinic input]
12856str.rjust as unicode_rjust
12857
12858 width: Py_ssize_t
12859 fillchar: Py_UCS4 = ' '
12860 /
12861
12862Return a right-justified string of length width.
12863
12864Padding is done using the specified fill character (default is a space).
12865[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012866
12867static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012868unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12869/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012871 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872 return NULL;
12873
Victor Stinnerc4b49542011-12-11 22:44:26 +010012874 if (PyUnicode_GET_LENGTH(self) >= width)
12875 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876
Victor Stinnerc4b49542011-12-11 22:44:26 +010012877 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878}
12879
Alexander Belopolsky40018472011-02-26 01:02:56 +000012880PyObject *
12881PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012883 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012884 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012885
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012886 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887}
12888
INADA Naoki3ae20562017-01-16 20:41:20 +090012889/*[clinic input]
12890str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891
INADA Naoki3ae20562017-01-16 20:41:20 +090012892 sep: object = None
        The delimiter according to which to split the string.
12894 None (the default value) means split according to any whitespace,
12895 and discard empty strings from the result.
12896 maxsplit: Py_ssize_t = -1
12897 Maximum number of splits to do.
12898 -1 (the default value) means no limit.
12899
12900Return a list of the words in the string, using sep as the delimiter string.
12901[clinic start generated code]*/
12902
12903static PyObject *
12904unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12905/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012906{
INADA Naoki3ae20562017-01-16 20:41:20 +090012907 if (sep == Py_None)
12908 return split(self, NULL, maxsplit);
12909 if (PyUnicode_Check(sep))
12910 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012911
Victor Stinner998b8062018-09-12 00:23:25 +020012912 PyErr_Format(PyExc_TypeError,
12913 "must be str or None, not %.100s",
12914 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012915 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012916}
12917
Thomas Wouters477c8d52006-05-27 19:21:47 +000012918PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012919PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012921 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012922 int kind1, kind2;
12923 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012924 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012926 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012927 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012928
Victor Stinner14f8f022011-10-05 20:58:25 +020012929 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012930 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012931 len1 = PyUnicode_GET_LENGTH(str_obj);
12932 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012933 if (kind1 < kind2 || len1 < len2) {
12934 _Py_INCREF_UNICODE_EMPTY();
12935 if (!unicode_empty)
12936 out = NULL;
12937 else {
12938 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12939 Py_DECREF(unicode_empty);
12940 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012941 return out;
12942 }
12943 buf1 = PyUnicode_DATA(str_obj);
12944 buf2 = PyUnicode_DATA(sep_obj);
12945 if (kind2 != kind1) {
12946 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12947 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012948 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012949 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012953 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12954 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12955 else
12956 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012957 break;
12958 case PyUnicode_2BYTE_KIND:
12959 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12960 break;
12961 case PyUnicode_4BYTE_KIND:
12962 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12963 break;
12964 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012965 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012966 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012967
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012968 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012969 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012970
12971 return out;
12972}
12973
12974
12975PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012976PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012977{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012978 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012979 int kind1, kind2;
12980 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012981 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012982
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012983 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012984 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012986 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012987 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012988 len1 = PyUnicode_GET_LENGTH(str_obj);
12989 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012990 if (kind1 < kind2 || len1 < len2) {
12991 _Py_INCREF_UNICODE_EMPTY();
12992 if (!unicode_empty)
12993 out = NULL;
12994 else {
12995 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12996 Py_DECREF(unicode_empty);
12997 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012998 return out;
12999 }
13000 buf1 = PyUnicode_DATA(str_obj);
13001 buf2 = PyUnicode_DATA(sep_obj);
13002 if (kind2 != kind1) {
13003 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13004 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013005 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013006 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013007
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013010 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13011 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13012 else
13013 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013014 break;
13015 case PyUnicode_2BYTE_KIND:
13016 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13017 break;
13018 case PyUnicode_4BYTE_KIND:
13019 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13020 break;
13021 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013022 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013023 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013024
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013025 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013026 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013027
13028 return out;
13029}
13030
INADA Naoki3ae20562017-01-16 20:41:20 +090013031/*[clinic input]
13032str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013033
INADA Naoki3ae20562017-01-16 20:41:20 +090013034 sep: object
13035 /
13036
13037Partition the string into three parts using the given separator.
13038
13039This will search for the separator in the string. If the separator is found,
13040returns a 3-tuple containing the part before the separator, the separator
13041itself, and the part after it.
13042
13043If the separator is not found, returns a 3-tuple containing the original string
13044and two empty strings.
13045[clinic start generated code]*/
13046
13047static PyObject *
13048unicode_partition(PyObject *self, PyObject *sep)
13049/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013050{
INADA Naoki3ae20562017-01-16 20:41:20 +090013051 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052}
13053
INADA Naoki3ae20562017-01-16 20:41:20 +090013054/*[clinic input]
13055str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056
INADA Naoki3ae20562017-01-16 20:41:20 +090013057Partition the string into three parts using the given separator.
13058
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013059This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013060the separator is found, returns a 3-tuple containing the part before the
13061separator, the separator itself, and the part after it.
13062
13063If the separator is not found, returns a 3-tuple containing two empty strings
13064and the original string.
13065[clinic start generated code]*/
13066
13067static PyObject *
13068unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013069/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013070{
INADA Naoki3ae20562017-01-16 20:41:20 +090013071 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013072}
13073
Alexander Belopolsky40018472011-02-26 01:02:56 +000013074PyObject *
13075PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013076{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013077 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013078 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013079
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013080 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013081}
13082
INADA Naoki3ae20562017-01-16 20:41:20 +090013083/*[clinic input]
13084str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013085
INADA Naoki3ae20562017-01-16 20:41:20 +090013086Return a list of the words in the string, using sep as the delimiter string.
13087
13088Splits are done starting at the end of the string and working to the front.
13089[clinic start generated code]*/
13090
13091static PyObject *
13092unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13093/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013094{
INADA Naoki3ae20562017-01-16 20:41:20 +090013095 if (sep == Py_None)
13096 return rsplit(self, NULL, maxsplit);
13097 if (PyUnicode_Check(sep))
13098 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013099
Victor Stinner998b8062018-09-12 00:23:25 +020013100 PyErr_Format(PyExc_TypeError,
13101 "must be str or None, not %.100s",
13102 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013103 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013104}
13105
INADA Naoki3ae20562017-01-16 20:41:20 +090013106/*[clinic input]
13107str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013108
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013109 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013110
13111Return a list of the lines in the string, breaking at line boundaries.
13112
13113Line breaks are not included in the resulting list unless keepends is given and
13114true.
13115[clinic start generated code]*/
13116
13117static PyObject *
13118unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013119/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013120{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013121 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122}
13123
13124static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013125PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013127 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128}
13129
INADA Naoki3ae20562017-01-16 20:41:20 +090013130/*[clinic input]
13131str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132
INADA Naoki3ae20562017-01-16 20:41:20 +090013133Convert uppercase characters to lowercase and lowercase characters to uppercase.
13134[clinic start generated code]*/
13135
13136static PyObject *
13137unicode_swapcase_impl(PyObject *self)
13138/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013139{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013140 if (PyUnicode_READY(self) == -1)
13141 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013142 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143}
13144
Larry Hastings61272b72014-01-07 12:41:53 -080013145/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013146
Larry Hastings31826802013-10-19 00:09:25 -070013147@staticmethod
13148str.maketrans as unicode_maketrans
13149
13150 x: object
13151
13152 y: unicode=NULL
13153
13154 z: unicode=NULL
13155
13156 /
13157
13158Return a translation table usable for str.translate().
13159
13160If there is only one argument, it must be a dictionary mapping Unicode
13161ordinals (integers) or characters to Unicode ordinals, strings or None.
13162Character keys will be then converted to ordinals.
13163If there are two arguments, they must be strings of equal length, and
13164in the resulting dictionary, each character in x will be mapped to the
13165character at the same position in y. If there is a third argument, it
13166must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013167[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013168
Larry Hastings31826802013-10-19 00:09:25 -070013169static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013170unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013171/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013172{
Georg Brandlceee0772007-11-27 23:48:05 +000013173 PyObject *new = NULL, *key, *value;
13174 Py_ssize_t i = 0;
13175 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013176
Georg Brandlceee0772007-11-27 23:48:05 +000013177 new = PyDict_New();
13178 if (!new)
13179 return NULL;
13180 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013181 int x_kind, y_kind, z_kind;
13182 void *x_data, *y_data, *z_data;
13183
Georg Brandlceee0772007-11-27 23:48:05 +000013184 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013185 if (!PyUnicode_Check(x)) {
13186 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13187 "be a string if there is a second argument");
13188 goto err;
13189 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013190 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013191 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13192 "arguments must have equal length");
13193 goto err;
13194 }
13195 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013196 x_kind = PyUnicode_KIND(x);
13197 y_kind = PyUnicode_KIND(y);
13198 x_data = PyUnicode_DATA(x);
13199 y_data = PyUnicode_DATA(y);
13200 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13201 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013202 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013203 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013204 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013205 if (!value) {
13206 Py_DECREF(key);
13207 goto err;
13208 }
Georg Brandlceee0772007-11-27 23:48:05 +000013209 res = PyDict_SetItem(new, key, value);
13210 Py_DECREF(key);
13211 Py_DECREF(value);
13212 if (res < 0)
13213 goto err;
13214 }
13215 /* create entries for deleting chars in z */
13216 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013217 z_kind = PyUnicode_KIND(z);
13218 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013219 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013220 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013221 if (!key)
13222 goto err;
13223 res = PyDict_SetItem(new, key, Py_None);
13224 Py_DECREF(key);
13225 if (res < 0)
13226 goto err;
13227 }
13228 }
13229 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013230 int kind;
13231 void *data;
13232
Georg Brandlceee0772007-11-27 23:48:05 +000013233 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013234 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013235 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13236 "to maketrans it must be a dict");
13237 goto err;
13238 }
13239 /* copy entries into the new dict, converting string keys to int keys */
13240 while (PyDict_Next(x, &i, &key, &value)) {
13241 if (PyUnicode_Check(key)) {
13242 /* convert string keys to integer keys */
13243 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013244 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013245 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13246 "table must be of length 1");
13247 goto err;
13248 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013249 kind = PyUnicode_KIND(key);
13250 data = PyUnicode_DATA(key);
13251 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013252 if (!newkey)
13253 goto err;
13254 res = PyDict_SetItem(new, newkey, value);
13255 Py_DECREF(newkey);
13256 if (res < 0)
13257 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013258 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013259 /* just keep integer keys */
13260 if (PyDict_SetItem(new, key, value) < 0)
13261 goto err;
13262 } else {
13263 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13264 "be strings or integers");
13265 goto err;
13266 }
13267 }
13268 }
13269 return new;
13270 err:
13271 Py_DECREF(new);
13272 return NULL;
13273}
13274
INADA Naoki3ae20562017-01-16 20:41:20 +090013275/*[clinic input]
13276str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013277
INADA Naoki3ae20562017-01-16 20:41:20 +090013278 table: object
13279 Translation table, which must be a mapping of Unicode ordinals to
13280 Unicode ordinals, strings, or None.
13281 /
13282
13283Replace each character in the string using the given translation table.
13284
13285The table must implement lookup/indexing via __getitem__, for instance a
13286dictionary or list. If this operation raises LookupError, the character is
13287left untouched. Characters mapped to None are deleted.
13288[clinic start generated code]*/
13289
13290static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013291unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013292/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013293{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013294 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295}
13296
INADA Naoki3ae20562017-01-16 20:41:20 +090013297/*[clinic input]
13298str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299
INADA Naoki3ae20562017-01-16 20:41:20 +090013300Return a copy of the string converted to uppercase.
13301[clinic start generated code]*/
13302
13303static PyObject *
13304unicode_upper_impl(PyObject *self)
13305/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013306{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013307 if (PyUnicode_READY(self) == -1)
13308 return NULL;
13309 if (PyUnicode_IS_ASCII(self))
13310 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013311 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013312}
13313
INADA Naoki3ae20562017-01-16 20:41:20 +090013314/*[clinic input]
13315str.zfill as unicode_zfill
13316
13317 width: Py_ssize_t
13318 /
13319
13320Pad a numeric string with zeros on the left, to fill a field of the given width.
13321
13322The string is never truncated.
13323[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013324
13325static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013326unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013327/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013329 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013330 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013331 int kind;
13332 void *data;
13333 Py_UCS4 chr;
13334
Benjamin Petersonbac79492012-01-14 13:34:47 -050013335 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013337
Victor Stinnerc4b49542011-12-11 22:44:26 +010013338 if (PyUnicode_GET_LENGTH(self) >= width)
13339 return unicode_result_unchanged(self);
13340
13341 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013342
13343 u = pad(self, fill, 0, '0');
13344
Walter Dörwald068325e2002-04-15 13:36:47 +000013345 if (u == NULL)
13346 return NULL;
13347
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013348 kind = PyUnicode_KIND(u);
13349 data = PyUnicode_DATA(u);
13350 chr = PyUnicode_READ(kind, data, fill);
13351
13352 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013353 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013354 PyUnicode_WRITE(kind, data, 0, chr);
13355 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013356 }
13357
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013358 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013359 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013361
13362#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013363static PyObject *
13364unicode__decimal2ascii(PyObject *self)
13365{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013366 return PyUnicode_TransformDecimalAndSpaceToASCII(self);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013367}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013368#endif
13369
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013370PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013371 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013372\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013373Return True if S starts with the specified prefix, False otherwise.\n\
13374With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013375With optional end, stop comparing S at that position.\n\
13376prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013377
13378static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013379unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013380 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013382 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013383 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013384 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013385 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013387
Jesus Ceaac451502011-04-20 17:09:23 +020013388 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013389 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013390 if (PyTuple_Check(subobj)) {
13391 Py_ssize_t i;
13392 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013393 substring = PyTuple_GET_ITEM(subobj, i);
13394 if (!PyUnicode_Check(substring)) {
13395 PyErr_Format(PyExc_TypeError,
13396 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013397 "not %.100s",
13398 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013399 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013400 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013402 if (result == -1)
13403 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013404 if (result) {
13405 Py_RETURN_TRUE;
13406 }
13407 }
13408 /* nothing matched */
13409 Py_RETURN_FALSE;
13410 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013411 if (!PyUnicode_Check(subobj)) {
13412 PyErr_Format(PyExc_TypeError,
13413 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013414 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013415 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013416 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013417 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013418 if (result == -1)
13419 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013420 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013421}
13422
13423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013424PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013425 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013426\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013427Return True if S ends with the specified suffix, False otherwise.\n\
13428With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013429With optional end, stop comparing S at that position.\n\
13430suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013431
13432static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013433unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013434 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013435{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013436 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013437 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013438 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013439 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441
Jesus Ceaac451502011-04-20 17:09:23 +020013442 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013443 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013444 if (PyTuple_Check(subobj)) {
13445 Py_ssize_t i;
13446 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013447 substring = PyTuple_GET_ITEM(subobj, i);
13448 if (!PyUnicode_Check(substring)) {
13449 PyErr_Format(PyExc_TypeError,
13450 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013451 "not %.100s",
13452 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013453 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013454 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013455 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013456 if (result == -1)
13457 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013458 if (result) {
13459 Py_RETURN_TRUE;
13460 }
13461 }
13462 Py_RETURN_FALSE;
13463 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013464 if (!PyUnicode_Check(subobj)) {
13465 PyErr_Format(PyExc_TypeError,
13466 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013467 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013468 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013469 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013470 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013471 if (result == -1)
13472 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013473 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013474}
13475
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013476static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013477_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013478{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013479 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13480 writer->data = PyUnicode_DATA(writer->buffer);
13481
13482 if (!writer->readonly) {
13483 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013484 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013485 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013486 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013487 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13488 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13489 writer->kind = PyUnicode_WCHAR_KIND;
13490 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13491
Victor Stinner8f674cc2013-04-17 23:02:17 +020013492 /* Copy-on-write mode: set buffer size to 0 so
13493 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13494 * next write. */
13495 writer->size = 0;
13496 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013497}
13498
Victor Stinnerd3f08822012-05-29 12:57:52 +020013499void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013500_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013501{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013502 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013503
13504 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013505 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013506
13507 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13508 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13509 writer->kind = PyUnicode_WCHAR_KIND;
13510 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013511}
13512
Victor Stinnerd3f08822012-05-29 12:57:52 +020013513int
13514_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13515 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013516{
13517 Py_ssize_t newlen;
13518 PyObject *newbuffer;
13519
Victor Stinner2740e462016-09-06 16:58:36 -070013520 assert(maxchar <= MAX_UNICODE);
13521
Victor Stinnerca9381e2015-09-22 00:58:32 +020013522 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013523 assert((maxchar > writer->maxchar && length >= 0)
13524 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013525
Victor Stinner202fdca2012-05-07 12:47:02 +020013526 if (length > PY_SSIZE_T_MAX - writer->pos) {
13527 PyErr_NoMemory();
13528 return -1;
13529 }
13530 newlen = writer->pos + length;
13531
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013532 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013533
Victor Stinnerd3f08822012-05-29 12:57:52 +020013534 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013535 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013536 if (writer->overallocate
13537 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13538 /* overallocate to limit the number of realloc() */
13539 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013540 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013541 if (newlen < writer->min_length)
13542 newlen = writer->min_length;
13543
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544 writer->buffer = PyUnicode_New(newlen, maxchar);
13545 if (writer->buffer == NULL)
13546 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013547 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013548 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013549 if (writer->overallocate
13550 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13551 /* overallocate to limit the number of realloc() */
13552 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013553 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013554 if (newlen < writer->min_length)
13555 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013556
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013557 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013558 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013559 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013560 newbuffer = PyUnicode_New(newlen, maxchar);
13561 if (newbuffer == NULL)
13562 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013563 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13564 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013565 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013566 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013567 }
13568 else {
13569 newbuffer = resize_compact(writer->buffer, newlen);
13570 if (newbuffer == NULL)
13571 return -1;
13572 }
13573 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013574 }
13575 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013576 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013577 newbuffer = PyUnicode_New(writer->size, maxchar);
13578 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013579 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013580 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13581 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013582 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013583 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013584 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013585 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013586
13587#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013588}
13589
Victor Stinnerca9381e2015-09-22 00:58:32 +020013590int
13591_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13592 enum PyUnicode_Kind kind)
13593{
13594 Py_UCS4 maxchar;
13595
13596 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13597 assert(writer->kind < kind);
13598
13599 switch (kind)
13600 {
13601 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13602 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13603 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13604 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013605 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013606 }
13607
13608 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13609}
13610
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013611static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013612_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013613{
Victor Stinner2740e462016-09-06 16:58:36 -070013614 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013615 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13616 return -1;
13617 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13618 writer->pos++;
13619 return 0;
13620}
13621
13622int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013623_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13624{
13625 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13626}
13627
13628int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013629_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13630{
13631 Py_UCS4 maxchar;
13632 Py_ssize_t len;
13633
13634 if (PyUnicode_READY(str) == -1)
13635 return -1;
13636 len = PyUnicode_GET_LENGTH(str);
13637 if (len == 0)
13638 return 0;
13639 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13640 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013641 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013642 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013643 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013644 Py_INCREF(str);
13645 writer->buffer = str;
13646 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013647 writer->pos += len;
13648 return 0;
13649 }
13650 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13651 return -1;
13652 }
13653 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13654 str, 0, len);
13655 writer->pos += len;
13656 return 0;
13657}
13658
Victor Stinnere215d962012-10-06 23:03:36 +020013659int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013660_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13661 Py_ssize_t start, Py_ssize_t end)
13662{
13663 Py_UCS4 maxchar;
13664 Py_ssize_t len;
13665
13666 if (PyUnicode_READY(str) == -1)
13667 return -1;
13668
13669 assert(0 <= start);
13670 assert(end <= PyUnicode_GET_LENGTH(str));
13671 assert(start <= end);
13672
13673 if (end == 0)
13674 return 0;
13675
13676 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13677 return _PyUnicodeWriter_WriteStr(writer, str);
13678
13679 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13680 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13681 else
13682 maxchar = writer->maxchar;
13683 len = end - start;
13684
13685 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13686 return -1;
13687
13688 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13689 str, start, len);
13690 writer->pos += len;
13691 return 0;
13692}
13693
13694int
Victor Stinner4a587072013-11-19 12:54:53 +010013695_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13696 const char *ascii, Py_ssize_t len)
13697{
13698 if (len == -1)
13699 len = strlen(ascii);
13700
13701 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13702
13703 if (writer->buffer == NULL && !writer->overallocate) {
13704 PyObject *str;
13705
13706 str = _PyUnicode_FromASCII(ascii, len);
13707 if (str == NULL)
13708 return -1;
13709
13710 writer->readonly = 1;
13711 writer->buffer = str;
13712 _PyUnicodeWriter_Update(writer);
13713 writer->pos += len;
13714 return 0;
13715 }
13716
13717 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13718 return -1;
13719
13720 switch (writer->kind)
13721 {
13722 case PyUnicode_1BYTE_KIND:
13723 {
13724 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13725 Py_UCS1 *data = writer->data;
13726
Christian Heimesf051e432016-09-13 20:22:02 +020013727 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013728 break;
13729 }
13730 case PyUnicode_2BYTE_KIND:
13731 {
13732 _PyUnicode_CONVERT_BYTES(
13733 Py_UCS1, Py_UCS2,
13734 ascii, ascii + len,
13735 (Py_UCS2 *)writer->data + writer->pos);
13736 break;
13737 }
13738 case PyUnicode_4BYTE_KIND:
13739 {
13740 _PyUnicode_CONVERT_BYTES(
13741 Py_UCS1, Py_UCS4,
13742 ascii, ascii + len,
13743 (Py_UCS4 *)writer->data + writer->pos);
13744 break;
13745 }
13746 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013747 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013748 }
13749
13750 writer->pos += len;
13751 return 0;
13752}
13753
13754int
13755_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13756 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013757{
13758 Py_UCS4 maxchar;
13759
13760 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13761 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13762 return -1;
13763 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13764 writer->pos += len;
13765 return 0;
13766}
13767
Victor Stinnerd3f08822012-05-29 12:57:52 +020013768PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013769_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013770{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013771 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013772
Victor Stinnerd3f08822012-05-29 12:57:52 +020013773 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013774 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013775 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013776 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013777
13778 str = writer->buffer;
13779 writer->buffer = NULL;
13780
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013781 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013782 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13783 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013784 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013785
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013786 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13787 PyObject *str2;
13788 str2 = resize_compact(str, writer->pos);
13789 if (str2 == NULL) {
13790 Py_DECREF(str);
13791 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013792 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013793 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013794 }
13795
Victor Stinner15a0bd32013-07-08 22:29:55 +020013796 assert(_PyUnicode_CheckConsistency(str, 1));
13797 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013798}
13799
Victor Stinnerd3f08822012-05-29 12:57:52 +020013800void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013801_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013802{
13803 Py_CLEAR(writer->buffer);
13804}
13805
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013806#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013807
13808PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013809 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013810\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013811Return a formatted version of S, using substitutions from args and kwargs.\n\
13812The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013813
Eric Smith27bbca62010-11-04 17:06:58 +000013814PyDoc_STRVAR(format_map__doc__,
13815 "S.format_map(mapping) -> str\n\
13816\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013817Return a formatted version of S, using substitutions from mapping.\n\
13818The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013819
INADA Naoki3ae20562017-01-16 20:41:20 +090013820/*[clinic input]
13821str.__format__ as unicode___format__
13822
13823 format_spec: unicode
13824 /
13825
13826Return a formatted version of the string as described by format_spec.
13827[clinic start generated code]*/
13828
Eric Smith4a7d76d2008-05-30 18:10:19 +000013829static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013830unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013831/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013832{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013833 _PyUnicodeWriter writer;
13834 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013835
Victor Stinnerd3f08822012-05-29 12:57:52 +020013836 if (PyUnicode_READY(self) == -1)
13837 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013838 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013839 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13840 self, format_spec, 0,
13841 PyUnicode_GET_LENGTH(format_spec));
13842 if (ret == -1) {
13843 _PyUnicodeWriter_Dealloc(&writer);
13844 return NULL;
13845 }
13846 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013847}
13848
INADA Naoki3ae20562017-01-16 20:41:20 +090013849/*[clinic input]
13850str.__sizeof__ as unicode_sizeof
13851
13852Return the size of the string in memory, in bytes.
13853[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013854
13855static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013856unicode_sizeof_impl(PyObject *self)
13857/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013858{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013859 Py_ssize_t size;
13860
13861 /* If it's a compact object, account for base structure +
13862 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013863 if (PyUnicode_IS_COMPACT_ASCII(self))
13864 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13865 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013866 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013867 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 else {
13869 /* If it is a two-block object, account for base object, and
13870 for character block if present. */
13871 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013872 if (_PyUnicode_DATA_ANY(self))
13873 size += (PyUnicode_GET_LENGTH(self) + 1) *
13874 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013875 }
13876 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013877 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013878 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13879 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13880 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13881 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013882
13883 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013884}
13885
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013886static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013887unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013888{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013889 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013890 if (!copy)
13891 return NULL;
13892 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013893}
13894
Guido van Rossumd57fd912000-03-10 22:53:23 +000013895static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013896 UNICODE_ENCODE_METHODDEF
13897 UNICODE_REPLACE_METHODDEF
13898 UNICODE_SPLIT_METHODDEF
13899 UNICODE_RSPLIT_METHODDEF
13900 UNICODE_JOIN_METHODDEF
13901 UNICODE_CAPITALIZE_METHODDEF
13902 UNICODE_CASEFOLD_METHODDEF
13903 UNICODE_TITLE_METHODDEF
13904 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013905 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013906 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013907 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013908 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013909 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013910 UNICODE_LJUST_METHODDEF
13911 UNICODE_LOWER_METHODDEF
13912 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013913 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13914 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013915 UNICODE_RJUST_METHODDEF
13916 UNICODE_RSTRIP_METHODDEF
13917 UNICODE_RPARTITION_METHODDEF
13918 UNICODE_SPLITLINES_METHODDEF
13919 UNICODE_STRIP_METHODDEF
13920 UNICODE_SWAPCASE_METHODDEF
13921 UNICODE_TRANSLATE_METHODDEF
13922 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013923 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13924 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013925 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013926 UNICODE_ISLOWER_METHODDEF
13927 UNICODE_ISUPPER_METHODDEF
13928 UNICODE_ISTITLE_METHODDEF
13929 UNICODE_ISSPACE_METHODDEF
13930 UNICODE_ISDECIMAL_METHODDEF
13931 UNICODE_ISDIGIT_METHODDEF
13932 UNICODE_ISNUMERIC_METHODDEF
13933 UNICODE_ISALPHA_METHODDEF
13934 UNICODE_ISALNUM_METHODDEF
13935 UNICODE_ISIDENTIFIER_METHODDEF
13936 UNICODE_ISPRINTABLE_METHODDEF
13937 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013938 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013939 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013940 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013941 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013942 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013943#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013944 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013945 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013946#endif
13947
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013948 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013949 {NULL, NULL}
13950};
13951
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013952static PyObject *
13953unicode_mod(PyObject *v, PyObject *w)
13954{
Brian Curtindfc80e32011-08-10 20:28:54 -050013955 if (!PyUnicode_Check(v))
13956 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013957 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013958}
13959
13960static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013961 0, /*nb_add*/
13962 0, /*nb_subtract*/
13963 0, /*nb_multiply*/
13964 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013965};
13966
Guido van Rossumd57fd912000-03-10 22:53:23 +000013967static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013968 (lenfunc) unicode_length, /* sq_length */
13969 PyUnicode_Concat, /* sq_concat */
13970 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13971 (ssizeargfunc) unicode_getitem, /* sq_item */
13972 0, /* sq_slice */
13973 0, /* sq_ass_item */
13974 0, /* sq_ass_slice */
13975 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013976};
13977
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013978static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013979unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013980{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013981 if (PyUnicode_READY(self) == -1)
13982 return NULL;
13983
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013984 if (PyIndex_Check(item)) {
13985 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013986 if (i == -1 && PyErr_Occurred())
13987 return NULL;
13988 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013989 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013990 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013991 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060013992 Py_ssize_t start, stop, step, slicelength, i;
13993 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013994 PyObject *result;
13995 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013996 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013997 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013998
Serhiy Storchakab879fe82017-04-08 09:53:51 +030013999 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014000 return NULL;
14001 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014002 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14003 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014004
14005 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014006 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014007 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014008 slicelength == PyUnicode_GET_LENGTH(self)) {
14009 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014010 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014011 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014012 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014013 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014014 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014015 src_kind = PyUnicode_KIND(self);
14016 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014017 if (!PyUnicode_IS_ASCII(self)) {
14018 kind_limit = kind_maxchar_limit(src_kind);
14019 max_char = 0;
14020 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14021 ch = PyUnicode_READ(src_kind, src_data, cur);
14022 if (ch > max_char) {
14023 max_char = ch;
14024 if (max_char >= kind_limit)
14025 break;
14026 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014027 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014028 }
Victor Stinner55c99112011-10-13 01:17:06 +020014029 else
14030 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014031 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014032 if (result == NULL)
14033 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014034 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014035 dest_data = PyUnicode_DATA(result);
14036
14037 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014038 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14039 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014040 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014041 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014042 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014043 } else {
14044 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14045 return NULL;
14046 }
14047}
14048
14049static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014050 (lenfunc)unicode_length, /* mp_length */
14051 (binaryfunc)unicode_subscript, /* mp_subscript */
14052 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014053};
14054
Guido van Rossumd57fd912000-03-10 22:53:23 +000014055
Guido van Rossumd57fd912000-03-10 22:53:23 +000014056/* Helpers for PyUnicode_Format() */
14057
Victor Stinnera47082312012-10-04 02:19:54 +020014058struct unicode_formatter_t {
14059 PyObject *args;
14060 int args_owned;
14061 Py_ssize_t arglen, argidx;
14062 PyObject *dict;
14063
14064 enum PyUnicode_Kind fmtkind;
14065 Py_ssize_t fmtcnt, fmtpos;
14066 void *fmtdata;
14067 PyObject *fmtstr;
14068
14069 _PyUnicodeWriter writer;
14070};
14071
14072struct unicode_format_arg_t {
14073 Py_UCS4 ch;
14074 int flags;
14075 Py_ssize_t width;
14076 int prec;
14077 int sign;
14078};
14079
Guido van Rossumd57fd912000-03-10 22:53:23 +000014080static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014081unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014082{
Victor Stinnera47082312012-10-04 02:19:54 +020014083 Py_ssize_t argidx = ctx->argidx;
14084
14085 if (argidx < ctx->arglen) {
14086 ctx->argidx++;
14087 if (ctx->arglen < 0)
14088 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014089 else
Victor Stinnera47082312012-10-04 02:19:54 +020014090 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014091 }
14092 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014093 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014094 return NULL;
14095}
14096
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014097/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098
Victor Stinnera47082312012-10-04 02:19:54 +020014099/* Format a float into the writer if the writer is not NULL, or into *p_output
14100 otherwise.
14101
14102 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014103static int
Victor Stinnera47082312012-10-04 02:19:54 +020014104formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14105 PyObject **p_output,
14106 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014107{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014108 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014109 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014110 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014111 int prec;
14112 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014113
Guido van Rossumd57fd912000-03-10 22:53:23 +000014114 x = PyFloat_AsDouble(v);
14115 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014116 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014117
Victor Stinnera47082312012-10-04 02:19:54 +020014118 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014120 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014121
Victor Stinnera47082312012-10-04 02:19:54 +020014122 if (arg->flags & F_ALT)
14123 dtoa_flags = Py_DTSF_ALT;
14124 else
14125 dtoa_flags = 0;
14126 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014127 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014128 return -1;
14129 len = strlen(p);
14130 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014131 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014132 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014133 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014134 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014135 }
14136 else
14137 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014138 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014139 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014140}
14141
Victor Stinnerd0880d52012-04-27 23:40:13 +020014142/* formatlong() emulates the format codes d, u, o, x and X, and
14143 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14144 * Python's regular ints.
14145 * Return value: a new PyUnicodeObject*, or NULL if error.
14146 * The output string is of the form
14147 * "-"? ("0x" | "0X")? digit+
14148 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14149 * set in flags. The case of hex digits will be correct,
14150 * There will be at least prec digits, zero-filled on the left if
14151 * necessary to get that many.
14152 * val object to be converted
14153 * flags bitmask of format flags; only F_ALT is looked at
14154 * prec minimum number of digits; 0-fill on left if needed
14155 * type a character in [duoxX]; u acts the same as d
14156 *
14157 * CAUTION: o, x and X conversions on regular ints can never
14158 * produce a '-' sign, but can for Python's unbounded ints.
14159 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014160PyObject *
14161_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014162{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014163 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014164 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014165 Py_ssize_t i;
14166 int sign; /* 1 if '-', else 0 */
14167 int len; /* number of characters */
14168 Py_ssize_t llen;
14169 int numdigits; /* len == numnondigits + numdigits */
14170 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014171
Victor Stinnerd0880d52012-04-27 23:40:13 +020014172 /* Avoid exceeding SSIZE_T_MAX */
14173 if (prec > INT_MAX-3) {
14174 PyErr_SetString(PyExc_OverflowError,
14175 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014176 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014177 }
14178
14179 assert(PyLong_Check(val));
14180
14181 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014182 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014183 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014184 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014185 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014186 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014187 /* int and int subclasses should print numerically when a numeric */
14188 /* format code is used (see issue18780) */
14189 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014190 break;
14191 case 'o':
14192 numnondigits = 2;
14193 result = PyNumber_ToBase(val, 8);
14194 break;
14195 case 'x':
14196 case 'X':
14197 numnondigits = 2;
14198 result = PyNumber_ToBase(val, 16);
14199 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014200 }
14201 if (!result)
14202 return NULL;
14203
14204 assert(unicode_modifiable(result));
14205 assert(PyUnicode_IS_READY(result));
14206 assert(PyUnicode_IS_ASCII(result));
14207
14208 /* To modify the string in-place, there can only be one reference. */
14209 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014210 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014211 PyErr_BadInternalCall();
14212 return NULL;
14213 }
14214 buf = PyUnicode_DATA(result);
14215 llen = PyUnicode_GET_LENGTH(result);
14216 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014217 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014218 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014219 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014220 return NULL;
14221 }
14222 len = (int)llen;
14223 sign = buf[0] == '-';
14224 numnondigits += sign;
14225 numdigits = len - numnondigits;
14226 assert(numdigits > 0);
14227
14228 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014229 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014230 (type == 'o' || type == 'x' || type == 'X'))) {
14231 assert(buf[sign] == '0');
14232 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14233 buf[sign+1] == 'o');
14234 numnondigits -= 2;
14235 buf += 2;
14236 len -= 2;
14237 if (sign)
14238 buf[0] = '-';
14239 assert(len == numnondigits + numdigits);
14240 assert(numdigits > 0);
14241 }
14242
14243 /* Fill with leading zeroes to meet minimum width. */
14244 if (prec > numdigits) {
14245 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14246 numnondigits + prec);
14247 char *b1;
14248 if (!r1) {
14249 Py_DECREF(result);
14250 return NULL;
14251 }
14252 b1 = PyBytes_AS_STRING(r1);
14253 for (i = 0; i < numnondigits; ++i)
14254 *b1++ = *buf++;
14255 for (i = 0; i < prec - numdigits; i++)
14256 *b1++ = '0';
14257 for (i = 0; i < numdigits; i++)
14258 *b1++ = *buf++;
14259 *b1 = '\0';
14260 Py_DECREF(result);
14261 result = r1;
14262 buf = PyBytes_AS_STRING(result);
14263 len = numnondigits + prec;
14264 }
14265
14266 /* Fix up case for hex conversions. */
14267 if (type == 'X') {
14268 /* Need to convert all lower case letters to upper case.
14269 and need to convert 0x to 0X (and -0x to -0X). */
14270 for (i = 0; i < len; i++)
14271 if (buf[i] >= 'a' && buf[i] <= 'x')
14272 buf[i] -= 'a'-'A';
14273 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014274 if (!PyUnicode_Check(result)
14275 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014276 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014277 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014278 Py_DECREF(result);
14279 result = unicode;
14280 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014281 else if (len != PyUnicode_GET_LENGTH(result)) {
14282 if (PyUnicode_Resize(&result, len) < 0)
14283 Py_CLEAR(result);
14284 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014285 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014286}
14287
Ethan Furmandf3ed242014-01-05 06:50:30 -080014288/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014289 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014290 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 * -1 and raise an exception on error */
14292static int
Victor Stinnera47082312012-10-04 02:19:54 +020014293mainformatlong(PyObject *v,
14294 struct unicode_format_arg_t *arg,
14295 PyObject **p_output,
14296 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014297{
14298 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014299 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014300
14301 if (!PyNumber_Check(v))
14302 goto wrongtype;
14303
Ethan Furman9ab74802014-03-21 06:38:46 -070014304 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014305 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014306 if (type == 'o' || type == 'x' || type == 'X') {
14307 iobj = PyNumber_Index(v);
14308 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014309 if (PyErr_ExceptionMatches(PyExc_TypeError))
14310 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014311 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014312 }
14313 }
14314 else {
14315 iobj = PyNumber_Long(v);
14316 if (iobj == NULL ) {
14317 if (PyErr_ExceptionMatches(PyExc_TypeError))
14318 goto wrongtype;
14319 return -1;
14320 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014321 }
14322 assert(PyLong_Check(iobj));
14323 }
14324 else {
14325 iobj = v;
14326 Py_INCREF(iobj);
14327 }
14328
14329 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014330 && arg->width == -1 && arg->prec == -1
14331 && !(arg->flags & (F_SIGN | F_BLANK))
14332 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014333 {
14334 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014335 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014336 int base;
14337
Victor Stinnera47082312012-10-04 02:19:54 +020014338 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014339 {
14340 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014341 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014342 case 'd':
14343 case 'i':
14344 case 'u':
14345 base = 10;
14346 break;
14347 case 'o':
14348 base = 8;
14349 break;
14350 case 'x':
14351 case 'X':
14352 base = 16;
14353 break;
14354 }
14355
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014356 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14357 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014358 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014359 }
14360 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014361 return 1;
14362 }
14363
Ethan Furmanb95b5612015-01-23 20:05:18 -080014364 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014365 Py_DECREF(iobj);
14366 if (res == NULL)
14367 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014368 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014369 return 0;
14370
14371wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014372 switch(type)
14373 {
14374 case 'o':
14375 case 'x':
14376 case 'X':
14377 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014378 "%%%c format: an integer is required, "
14379 "not %.200s",
14380 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014381 break;
14382 default:
14383 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014384 "%%%c format: a number is required, "
14385 "not %.200s",
14386 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014387 break;
14388 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014389 return -1;
14390}
14391
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014392static Py_UCS4
14393formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014394{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014395 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014396 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014397 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014398 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014399 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014400 goto onError;
14401 }
14402 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014403 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014405 /* make sure number is a type of integer */
14406 if (!PyLong_Check(v)) {
14407 iobj = PyNumber_Index(v);
14408 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014409 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014410 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014411 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014412 Py_DECREF(iobj);
14413 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014414 else {
14415 x = PyLong_AsLong(v);
14416 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014417 if (x == -1 && PyErr_Occurred())
14418 goto onError;
14419
Victor Stinner8faf8212011-12-08 22:14:11 +010014420 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014421 PyErr_SetString(PyExc_OverflowError,
14422 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014423 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014424 }
14425
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014426 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014427 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014428
Benjamin Peterson29060642009-01-31 22:14:21 +000014429 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014430 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014431 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014432 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014433}
14434
Victor Stinnera47082312012-10-04 02:19:54 +020014435/* Parse options of an argument: flags, width, precision.
14436 Handle also "%(name)" syntax.
14437
14438 Return 0 if the argument has been formatted into arg->str.
14439 Return 1 if the argument has been written into ctx->writer,
14440 Raise an exception and return -1 on error. */
14441static int
14442unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14443 struct unicode_format_arg_t *arg)
14444{
14445#define FORMAT_READ(ctx) \
14446 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14447
14448 PyObject *v;
14449
Victor Stinnera47082312012-10-04 02:19:54 +020014450 if (arg->ch == '(') {
14451 /* Get argument value from a dictionary. Example: "%(name)s". */
14452 Py_ssize_t keystart;
14453 Py_ssize_t keylen;
14454 PyObject *key;
14455 int pcount = 1;
14456
14457 if (ctx->dict == NULL) {
14458 PyErr_SetString(PyExc_TypeError,
14459 "format requires a mapping");
14460 return -1;
14461 }
14462 ++ctx->fmtpos;
14463 --ctx->fmtcnt;
14464 keystart = ctx->fmtpos;
14465 /* Skip over balanced parentheses */
14466 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14467 arg->ch = FORMAT_READ(ctx);
14468 if (arg->ch == ')')
14469 --pcount;
14470 else if (arg->ch == '(')
14471 ++pcount;
14472 ctx->fmtpos++;
14473 }
14474 keylen = ctx->fmtpos - keystart - 1;
14475 if (ctx->fmtcnt < 0 || pcount > 0) {
14476 PyErr_SetString(PyExc_ValueError,
14477 "incomplete format key");
14478 return -1;
14479 }
14480 key = PyUnicode_Substring(ctx->fmtstr,
14481 keystart, keystart + keylen);
14482 if (key == NULL)
14483 return -1;
14484 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014485 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014486 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014487 }
14488 ctx->args = PyObject_GetItem(ctx->dict, key);
14489 Py_DECREF(key);
14490 if (ctx->args == NULL)
14491 return -1;
14492 ctx->args_owned = 1;
14493 ctx->arglen = -1;
14494 ctx->argidx = -2;
14495 }
14496
14497 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014498 while (--ctx->fmtcnt >= 0) {
14499 arg->ch = FORMAT_READ(ctx);
14500 ctx->fmtpos++;
14501 switch (arg->ch) {
14502 case '-': arg->flags |= F_LJUST; continue;
14503 case '+': arg->flags |= F_SIGN; continue;
14504 case ' ': arg->flags |= F_BLANK; continue;
14505 case '#': arg->flags |= F_ALT; continue;
14506 case '0': arg->flags |= F_ZERO; continue;
14507 }
14508 break;
14509 }
14510
14511 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014512 if (arg->ch == '*') {
14513 v = unicode_format_getnextarg(ctx);
14514 if (v == NULL)
14515 return -1;
14516 if (!PyLong_Check(v)) {
14517 PyErr_SetString(PyExc_TypeError,
14518 "* wants int");
14519 return -1;
14520 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014521 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014522 if (arg->width == -1 && PyErr_Occurred())
14523 return -1;
14524 if (arg->width < 0) {
14525 arg->flags |= F_LJUST;
14526 arg->width = -arg->width;
14527 }
14528 if (--ctx->fmtcnt >= 0) {
14529 arg->ch = FORMAT_READ(ctx);
14530 ctx->fmtpos++;
14531 }
14532 }
14533 else if (arg->ch >= '0' && arg->ch <= '9') {
14534 arg->width = arg->ch - '0';
14535 while (--ctx->fmtcnt >= 0) {
14536 arg->ch = FORMAT_READ(ctx);
14537 ctx->fmtpos++;
14538 if (arg->ch < '0' || arg->ch > '9')
14539 break;
14540 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14541 mixing signed and unsigned comparison. Since arg->ch is between
14542 '0' and '9', casting to int is safe. */
14543 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14544 PyErr_SetString(PyExc_ValueError,
14545 "width too big");
14546 return -1;
14547 }
14548 arg->width = arg->width*10 + (arg->ch - '0');
14549 }
14550 }
14551
14552 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014553 if (arg->ch == '.') {
14554 arg->prec = 0;
14555 if (--ctx->fmtcnt >= 0) {
14556 arg->ch = FORMAT_READ(ctx);
14557 ctx->fmtpos++;
14558 }
14559 if (arg->ch == '*') {
14560 v = unicode_format_getnextarg(ctx);
14561 if (v == NULL)
14562 return -1;
14563 if (!PyLong_Check(v)) {
14564 PyErr_SetString(PyExc_TypeError,
14565 "* wants int");
14566 return -1;
14567 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014568 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014569 if (arg->prec == -1 && PyErr_Occurred())
14570 return -1;
14571 if (arg->prec < 0)
14572 arg->prec = 0;
14573 if (--ctx->fmtcnt >= 0) {
14574 arg->ch = FORMAT_READ(ctx);
14575 ctx->fmtpos++;
14576 }
14577 }
14578 else if (arg->ch >= '0' && arg->ch <= '9') {
14579 arg->prec = arg->ch - '0';
14580 while (--ctx->fmtcnt >= 0) {
14581 arg->ch = FORMAT_READ(ctx);
14582 ctx->fmtpos++;
14583 if (arg->ch < '0' || arg->ch > '9')
14584 break;
14585 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14586 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014587 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014588 return -1;
14589 }
14590 arg->prec = arg->prec*10 + (arg->ch - '0');
14591 }
14592 }
14593 }
14594
14595 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14596 if (ctx->fmtcnt >= 0) {
14597 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14598 if (--ctx->fmtcnt >= 0) {
14599 arg->ch = FORMAT_READ(ctx);
14600 ctx->fmtpos++;
14601 }
14602 }
14603 }
14604 if (ctx->fmtcnt < 0) {
14605 PyErr_SetString(PyExc_ValueError,
14606 "incomplete format");
14607 return -1;
14608 }
14609 return 0;
14610
14611#undef FORMAT_READ
14612}
14613
14614/* Format one argument. Supported conversion specifiers:
14615
14616 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014617 - "i", "d", "u": int or float
14618 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014619 - "e", "E", "f", "F", "g", "G": float
14620 - "c": int or str (1 character)
14621
Victor Stinner8dbd4212012-12-04 09:30:24 +010014622 When possible, the output is written directly into the Unicode writer
14623 (ctx->writer). A string is created when padding is required.
14624
Victor Stinnera47082312012-10-04 02:19:54 +020014625 Return 0 if the argument has been formatted into *p_str,
14626 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014627 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014628static int
14629unicode_format_arg_format(struct unicode_formatter_t *ctx,
14630 struct unicode_format_arg_t *arg,
14631 PyObject **p_str)
14632{
14633 PyObject *v;
14634 _PyUnicodeWriter *writer = &ctx->writer;
14635
14636 if (ctx->fmtcnt == 0)
14637 ctx->writer.overallocate = 0;
14638
Victor Stinnera47082312012-10-04 02:19:54 +020014639 v = unicode_format_getnextarg(ctx);
14640 if (v == NULL)
14641 return -1;
14642
Victor Stinnera47082312012-10-04 02:19:54 +020014643
14644 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014645 case 's':
14646 case 'r':
14647 case 'a':
14648 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14649 /* Fast path */
14650 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14651 return -1;
14652 return 1;
14653 }
14654
14655 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14656 *p_str = v;
14657 Py_INCREF(*p_str);
14658 }
14659 else {
14660 if (arg->ch == 's')
14661 *p_str = PyObject_Str(v);
14662 else if (arg->ch == 'r')
14663 *p_str = PyObject_Repr(v);
14664 else
14665 *p_str = PyObject_ASCII(v);
14666 }
14667 break;
14668
14669 case 'i':
14670 case 'd':
14671 case 'u':
14672 case 'o':
14673 case 'x':
14674 case 'X':
14675 {
14676 int ret = mainformatlong(v, arg, p_str, writer);
14677 if (ret != 0)
14678 return ret;
14679 arg->sign = 1;
14680 break;
14681 }
14682
14683 case 'e':
14684 case 'E':
14685 case 'f':
14686 case 'F':
14687 case 'g':
14688 case 'G':
14689 if (arg->width == -1 && arg->prec == -1
14690 && !(arg->flags & (F_SIGN | F_BLANK)))
14691 {
14692 /* Fast path */
14693 if (formatfloat(v, arg, NULL, writer) == -1)
14694 return -1;
14695 return 1;
14696 }
14697
14698 arg->sign = 1;
14699 if (formatfloat(v, arg, p_str, NULL) == -1)
14700 return -1;
14701 break;
14702
14703 case 'c':
14704 {
14705 Py_UCS4 ch = formatchar(v);
14706 if (ch == (Py_UCS4) -1)
14707 return -1;
14708 if (arg->width == -1 && arg->prec == -1) {
14709 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014710 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014711 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014712 return 1;
14713 }
14714 *p_str = PyUnicode_FromOrdinal(ch);
14715 break;
14716 }
14717
14718 default:
14719 PyErr_Format(PyExc_ValueError,
14720 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014721 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014722 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14723 (int)arg->ch,
14724 ctx->fmtpos - 1);
14725 return -1;
14726 }
14727 if (*p_str == NULL)
14728 return -1;
14729 assert (PyUnicode_Check(*p_str));
14730 return 0;
14731}
14732
14733static int
14734unicode_format_arg_output(struct unicode_formatter_t *ctx,
14735 struct unicode_format_arg_t *arg,
14736 PyObject *str)
14737{
14738 Py_ssize_t len;
14739 enum PyUnicode_Kind kind;
14740 void *pbuf;
14741 Py_ssize_t pindex;
14742 Py_UCS4 signchar;
14743 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014744 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014745 Py_ssize_t sublen;
14746 _PyUnicodeWriter *writer = &ctx->writer;
14747 Py_UCS4 fill;
14748
14749 fill = ' ';
14750 if (arg->sign && arg->flags & F_ZERO)
14751 fill = '0';
14752
14753 if (PyUnicode_READY(str) == -1)
14754 return -1;
14755
14756 len = PyUnicode_GET_LENGTH(str);
14757 if ((arg->width == -1 || arg->width <= len)
14758 && (arg->prec == -1 || arg->prec >= len)
14759 && !(arg->flags & (F_SIGN | F_BLANK)))
14760 {
14761 /* Fast path */
14762 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14763 return -1;
14764 return 0;
14765 }
14766
14767 /* Truncate the string for "s", "r" and "a" formats
14768 if the precision is set */
14769 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14770 if (arg->prec >= 0 && len > arg->prec)
14771 len = arg->prec;
14772 }
14773
14774 /* Adjust sign and width */
14775 kind = PyUnicode_KIND(str);
14776 pbuf = PyUnicode_DATA(str);
14777 pindex = 0;
14778 signchar = '\0';
14779 if (arg->sign) {
14780 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14781 if (ch == '-' || ch == '+') {
14782 signchar = ch;
14783 len--;
14784 pindex++;
14785 }
14786 else if (arg->flags & F_SIGN)
14787 signchar = '+';
14788 else if (arg->flags & F_BLANK)
14789 signchar = ' ';
14790 else
14791 arg->sign = 0;
14792 }
14793 if (arg->width < len)
14794 arg->width = len;
14795
14796 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014797 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014798 if (!(arg->flags & F_LJUST)) {
14799 if (arg->sign) {
14800 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014801 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014802 }
14803 else {
14804 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014805 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014806 }
14807 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014808 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14809 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014810 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014811 }
14812
Victor Stinnera47082312012-10-04 02:19:54 +020014813 buflen = arg->width;
14814 if (arg->sign && len == arg->width)
14815 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014816 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014817 return -1;
14818
14819 /* Write the sign if needed */
14820 if (arg->sign) {
14821 if (fill != ' ') {
14822 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14823 writer->pos += 1;
14824 }
14825 if (arg->width > len)
14826 arg->width--;
14827 }
14828
14829 /* Write the numeric prefix for "x", "X" and "o" formats
14830 if the alternate form is used.
14831 For example, write "0x" for the "%#x" format. */
14832 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14833 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14834 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14835 if (fill != ' ') {
14836 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14837 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14838 writer->pos += 2;
14839 pindex += 2;
14840 }
14841 arg->width -= 2;
14842 if (arg->width < 0)
14843 arg->width = 0;
14844 len -= 2;
14845 }
14846
14847 /* Pad left with the fill character if needed */
14848 if (arg->width > len && !(arg->flags & F_LJUST)) {
14849 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014850 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014851 writer->pos += sublen;
14852 arg->width = len;
14853 }
14854
14855 /* If padding with spaces: write sign if needed and/or numeric prefix if
14856 the alternate form is used */
14857 if (fill == ' ') {
14858 if (arg->sign) {
14859 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14860 writer->pos += 1;
14861 }
14862 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14863 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14864 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14865 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14866 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14867 writer->pos += 2;
14868 pindex += 2;
14869 }
14870 }
14871
14872 /* Write characters */
14873 if (len) {
14874 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14875 str, pindex, len);
14876 writer->pos += len;
14877 }
14878
14879 /* Pad right with the fill character if needed */
14880 if (arg->width > len) {
14881 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014882 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014883 writer->pos += sublen;
14884 }
14885 return 0;
14886}
14887
14888/* Helper of PyUnicode_Format(): format one arg.
14889 Return 0 on success, raise an exception and return -1 on error. */
14890static int
14891unicode_format_arg(struct unicode_formatter_t *ctx)
14892{
14893 struct unicode_format_arg_t arg;
14894 PyObject *str;
14895 int ret;
14896
Victor Stinner8dbd4212012-12-04 09:30:24 +010014897 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014898 if (arg.ch == '%') {
14899 ctx->fmtpos++;
14900 ctx->fmtcnt--;
14901 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14902 return -1;
14903 return 0;
14904 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014905 arg.flags = 0;
14906 arg.width = -1;
14907 arg.prec = -1;
14908 arg.sign = 0;
14909 str = NULL;
14910
Victor Stinnera47082312012-10-04 02:19:54 +020014911 ret = unicode_format_arg_parse(ctx, &arg);
14912 if (ret == -1)
14913 return -1;
14914
14915 ret = unicode_format_arg_format(ctx, &arg, &str);
14916 if (ret == -1)
14917 return -1;
14918
14919 if (ret != 1) {
14920 ret = unicode_format_arg_output(ctx, &arg, str);
14921 Py_DECREF(str);
14922 if (ret == -1)
14923 return -1;
14924 }
14925
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014926 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014927 PyErr_SetString(PyExc_TypeError,
14928 "not all arguments converted during string formatting");
14929 return -1;
14930 }
14931 return 0;
14932}
14933
Alexander Belopolsky40018472011-02-26 01:02:56 +000014934PyObject *
14935PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014936{
Victor Stinnera47082312012-10-04 02:19:54 +020014937 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014938
Guido van Rossumd57fd912000-03-10 22:53:23 +000014939 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014940 PyErr_BadInternalCall();
14941 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014942 }
Victor Stinnera47082312012-10-04 02:19:54 +020014943
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014944 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014945 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014946
14947 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014948 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14949 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14950 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14951 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014952
Victor Stinner8f674cc2013-04-17 23:02:17 +020014953 _PyUnicodeWriter_Init(&ctx.writer);
14954 ctx.writer.min_length = ctx.fmtcnt + 100;
14955 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014956
Guido van Rossumd57fd912000-03-10 22:53:23 +000014957 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014958 ctx.arglen = PyTuple_Size(args);
14959 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014960 }
14961 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014962 ctx.arglen = -1;
14963 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964 }
Victor Stinnera47082312012-10-04 02:19:54 +020014965 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014966 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014967 ctx.dict = args;
14968 else
14969 ctx.dict = NULL;
14970 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014971
Victor Stinnera47082312012-10-04 02:19:54 +020014972 while (--ctx.fmtcnt >= 0) {
14973 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014974 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014975
14976 nonfmtpos = ctx.fmtpos++;
14977 while (ctx.fmtcnt >= 0 &&
14978 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14979 ctx.fmtpos++;
14980 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014981 }
Victor Stinnera47082312012-10-04 02:19:54 +020014982 if (ctx.fmtcnt < 0) {
14983 ctx.fmtpos--;
14984 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014985 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014986
Victor Stinnercfc4c132013-04-03 01:48:39 +020014987 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14988 nonfmtpos, ctx.fmtpos) < 0)
14989 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014990 }
14991 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014992 ctx.fmtpos++;
14993 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014994 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014995 }
14996 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014997
Victor Stinnera47082312012-10-04 02:19:54 +020014998 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014999 PyErr_SetString(PyExc_TypeError,
15000 "not all arguments converted during string formatting");
15001 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015002 }
15003
Victor Stinnera47082312012-10-04 02:19:54 +020015004 if (ctx.args_owned) {
15005 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015006 }
Victor Stinnera47082312012-10-04 02:19:54 +020015007 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015008
Benjamin Peterson29060642009-01-31 22:14:21 +000015009 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015010 _PyUnicodeWriter_Dealloc(&ctx.writer);
15011 if (ctx.args_owned) {
15012 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015013 }
15014 return NULL;
15015}
15016
Jeremy Hylton938ace62002-07-17 16:30:39 +000015017static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015018unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15019
Tim Peters6d6c1a32001-08-02 04:15:00 +000015020static PyObject *
15021unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15022{
Benjamin Peterson29060642009-01-31 22:14:21 +000015023 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015024 static char *kwlist[] = {"object", "encoding", "errors", 0};
15025 char *encoding = NULL;
15026 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015027
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 if (type != &PyUnicode_Type)
15029 return unicode_subtype_new(type, args, kwds);
15030 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015031 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 return NULL;
15033 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015034 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015035 if (encoding == NULL && errors == NULL)
15036 return PyObject_Str(x);
15037 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015038 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015039}
15040
Guido van Rossume023fe02001-08-30 03:12:59 +000015041static PyObject *
15042unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15043{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015044 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015045 Py_ssize_t length, char_size;
15046 int share_wstr, share_utf8;
15047 unsigned int kind;
15048 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015049
Benjamin Peterson14339b62009-01-31 16:36:08 +000015050 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015051
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015052 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015053 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015055 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015056 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015057 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015058 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015059 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015061 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 if (self == NULL) {
15063 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015064 return NULL;
15065 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015066 kind = PyUnicode_KIND(unicode);
15067 length = PyUnicode_GET_LENGTH(unicode);
15068
15069 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015070#ifdef Py_DEBUG
15071 _PyUnicode_HASH(self) = -1;
15072#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015073 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015074#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 _PyUnicode_STATE(self).interned = 0;
15076 _PyUnicode_STATE(self).kind = kind;
15077 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015078 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079 _PyUnicode_STATE(self).ready = 1;
15080 _PyUnicode_WSTR(self) = NULL;
15081 _PyUnicode_UTF8_LENGTH(self) = 0;
15082 _PyUnicode_UTF8(self) = NULL;
15083 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015084 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015085
15086 share_utf8 = 0;
15087 share_wstr = 0;
15088 if (kind == PyUnicode_1BYTE_KIND) {
15089 char_size = 1;
15090 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15091 share_utf8 = 1;
15092 }
15093 else if (kind == PyUnicode_2BYTE_KIND) {
15094 char_size = 2;
15095 if (sizeof(wchar_t) == 2)
15096 share_wstr = 1;
15097 }
15098 else {
15099 assert(kind == PyUnicode_4BYTE_KIND);
15100 char_size = 4;
15101 if (sizeof(wchar_t) == 4)
15102 share_wstr = 1;
15103 }
15104
15105 /* Ensure we won't overflow the length. */
15106 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15107 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015108 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015109 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015110 data = PyObject_MALLOC((length + 1) * char_size);
15111 if (data == NULL) {
15112 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015113 goto onError;
15114 }
15115
Victor Stinnerc3c74152011-10-02 20:39:55 +020015116 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015117 if (share_utf8) {
15118 _PyUnicode_UTF8_LENGTH(self) = length;
15119 _PyUnicode_UTF8(self) = data;
15120 }
15121 if (share_wstr) {
15122 _PyUnicode_WSTR_LENGTH(self) = length;
15123 _PyUnicode_WSTR(self) = (wchar_t *)data;
15124 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015125
Christian Heimesf051e432016-09-13 20:22:02 +020015126 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015127 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015128 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015129#ifdef Py_DEBUG
15130 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15131#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015132 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015133 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015134
15135onError:
15136 Py_DECREF(unicode);
15137 Py_DECREF(self);
15138 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015139}
15140
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015141PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015142"str(object='') -> str\n\
15143str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015144\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015145Create a new string object from the given object. If encoding or\n\
15146errors is specified, then the object must expose a data buffer\n\
15147that will be decoded using the given encoding and error handler.\n\
15148Otherwise, returns the result of object.__str__() (if defined)\n\
15149or repr(object).\n\
15150encoding defaults to sys.getdefaultencoding().\n\
15151errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015152
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015153static PyObject *unicode_iter(PyObject *seq);
15154
Guido van Rossumd57fd912000-03-10 22:53:23 +000015155PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015156 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015157 "str", /* tp_name */
15158 sizeof(PyUnicodeObject), /* tp_basicsize */
15159 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015160 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015161 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015162 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015163 0, /* tp_getattr */
15164 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015165 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015166 unicode_repr, /* tp_repr */
15167 &unicode_as_number, /* tp_as_number */
15168 &unicode_as_sequence, /* tp_as_sequence */
15169 &unicode_as_mapping, /* tp_as_mapping */
15170 (hashfunc) unicode_hash, /* tp_hash*/
15171 0, /* tp_call*/
15172 (reprfunc) unicode_str, /* tp_str */
15173 PyObject_GenericGetAttr, /* tp_getattro */
15174 0, /* tp_setattro */
15175 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015177 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15178 unicode_doc, /* tp_doc */
15179 0, /* tp_traverse */
15180 0, /* tp_clear */
15181 PyUnicode_RichCompare, /* tp_richcompare */
15182 0, /* tp_weaklistoffset */
15183 unicode_iter, /* tp_iter */
15184 0, /* tp_iternext */
15185 unicode_methods, /* tp_methods */
15186 0, /* tp_members */
15187 0, /* tp_getset */
15188 &PyBaseObject_Type, /* tp_base */
15189 0, /* tp_dict */
15190 0, /* tp_descr_get */
15191 0, /* tp_descr_set */
15192 0, /* tp_dictoffset */
15193 0, /* tp_init */
15194 0, /* tp_alloc */
15195 unicode_new, /* tp_new */
15196 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015197};
15198
15199/* Initialize the Unicode implementation */
15200
Victor Stinner331a6a52019-05-27 16:39:22 +020015201PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015202_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015203{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015204 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015205 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015206 0x000A, /* LINE FEED */
15207 0x000D, /* CARRIAGE RETURN */
15208 0x001C, /* FILE SEPARATOR */
15209 0x001D, /* GROUP SEPARATOR */
15210 0x001E, /* RECORD SEPARATOR */
15211 0x0085, /* NEXT LINE */
15212 0x2028, /* LINE SEPARATOR */
15213 0x2029, /* PARAGRAPH SEPARATOR */
15214 };
15215
Fred Drakee4315f52000-05-09 19:53:39 +000015216 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015217 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015218 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015219 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015220 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015221 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015222
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015223 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015224 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015225 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015226
15227 /* initialize the linebreak bloom filter */
15228 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015229 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015230 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015231
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015232 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015233 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015234 }
15235 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015236 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015237 }
15238 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015239 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015240 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015241 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015242}
15243
15244/* Finalize the Unicode implementation */
15245
/* Does no work in this implementation and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15251
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015252
Walter Dörwald16807132007-05-25 13:52:07 +000015253void
15254PyUnicode_InternInPlace(PyObject **p)
15255{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015256 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015257 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015258#ifdef Py_DEBUG
15259 assert(s != NULL);
15260 assert(_PyUnicode_CHECK(s));
15261#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015262 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015263 return;
15264#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015265 /* If it's a subclass, we don't really know what putting
15266 it in the interned dict might do. */
15267 if (!PyUnicode_CheckExact(s))
15268 return;
15269 if (PyUnicode_CHECK_INTERNED(s))
15270 return;
15271 if (interned == NULL) {
15272 interned = PyDict_New();
15273 if (interned == NULL) {
15274 PyErr_Clear(); /* Don't leave an exception */
15275 return;
15276 }
15277 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015278 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015279 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015281 if (t == NULL) {
15282 PyErr_Clear();
15283 return;
15284 }
15285 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015286 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015287 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015288 return;
15289 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015290 /* The two references in interned are not counted by refcnt.
15291 The deallocator will take care of this */
15292 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015293 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015294}
15295
15296void
15297PyUnicode_InternImmortal(PyObject **p)
15298{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015299 PyUnicode_InternInPlace(p);
15300 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015301 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015302 Py_INCREF(*p);
15303 }
Walter Dörwald16807132007-05-25 13:52:07 +000015304}
15305
15306PyObject *
15307PyUnicode_InternFromString(const char *cp)
15308{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015309 PyObject *s = PyUnicode_FromString(cp);
15310 if (s == NULL)
15311 return NULL;
15312 PyUnicode_InternInPlace(&s);
15313 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015314}
15315
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015316
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Hand back to every interned string the references that interning took
   away from its reference count, reset its interned state, then empty and
   release the interned dict.  Only compiled for memory-checker builds. */
static void
unicode_release_interned(void)
{
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }

    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    const Py_ssize_t count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (Py_ssize_t idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(keys, idx);

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }

        switch (PyUnicode_CHECK_INTERNED(str)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
            break;
        case SSTATE_INTERNED_MORTAL:
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif

    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015376
15377
15378/********************* Unicode Iterator **************************/
15379
/* State of an iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;    /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15385
15386static void
15387unicodeiter_dealloc(unicodeiterobject *it)
15388{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015389 _PyObject_GC_UNTRACK(it);
15390 Py_XDECREF(it->it_seq);
15391 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015392}
15393
15394static int
15395unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15396{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015397 Py_VISIT(it->it_seq);
15398 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015399}
15400
15401static PyObject *
15402unicodeiter_next(unicodeiterobject *it)
15403{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015404 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015405
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 assert(it != NULL);
15407 seq = it->it_seq;
15408 if (seq == NULL)
15409 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015410 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015411
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015412 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15413 int kind = PyUnicode_KIND(seq);
15414 void *data = PyUnicode_DATA(seq);
15415 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15416 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015417 if (item != NULL)
15418 ++it->it_index;
15419 return item;
15420 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015421
Benjamin Peterson14339b62009-01-31 16:36:08 +000015422 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015423 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015424 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015425}
15426
15427static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015428unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015429{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015430 Py_ssize_t len = 0;
15431 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015432 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015433 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015434}
15435
/* Docstring of str_iterator.__length_hint__(). */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15437
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015438static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015439unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015440{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015441 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015442 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015443 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015444 it->it_seq, it->it_index);
15445 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015446 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015447 if (u == NULL)
15448 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015449 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015450 }
15451}
15452
/* Docstring of str_iterator.__reduce__(). */
PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15454
15455static PyObject *
15456unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15457{
15458 Py_ssize_t index = PyLong_AsSsize_t(state);
15459 if (index == -1 && PyErr_Occurred())
15460 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015461 if (it->it_seq != NULL) {
15462 if (index < 0)
15463 index = 0;
15464 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15465 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15466 it->it_index = index;
15467 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015468 Py_RETURN_NONE;
15469}
15470
/* Docstring of str_iterator.__setstate__(). */
PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15472
/* Method table of the string iterator type: length hint plus the two
   pickling hooks. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15482
15483PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015484 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15485 "str_iterator", /* tp_name */
15486 sizeof(unicodeiterobject), /* tp_basicsize */
15487 0, /* tp_itemsize */
15488 /* methods */
15489 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015490 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015491 0, /* tp_getattr */
15492 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015493 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015494 0, /* tp_repr */
15495 0, /* tp_as_number */
15496 0, /* tp_as_sequence */
15497 0, /* tp_as_mapping */
15498 0, /* tp_hash */
15499 0, /* tp_call */
15500 0, /* tp_str */
15501 PyObject_GenericGetAttr, /* tp_getattro */
15502 0, /* tp_setattro */
15503 0, /* tp_as_buffer */
15504 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15505 0, /* tp_doc */
15506 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15507 0, /* tp_clear */
15508 0, /* tp_richcompare */
15509 0, /* tp_weaklistoffset */
15510 PyObject_SelfIter, /* tp_iter */
15511 (iternextfunc)unicodeiter_next, /* tp_iternext */
15512 unicodeiter_methods, /* tp_methods */
15513 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015514};
15515
15516static PyObject *
15517unicode_iter(PyObject *seq)
15518{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015519 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015520
Benjamin Peterson14339b62009-01-31 16:36:08 +000015521 if (!PyUnicode_Check(seq)) {
15522 PyErr_BadInternalCall();
15523 return NULL;
15524 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015525 if (PyUnicode_READY(seq) == -1)
15526 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015527 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15528 if (it == NULL)
15529 return NULL;
15530 it->it_index = 0;
15531 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015532 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015533 _PyObject_GC_TRACK(it);
15534 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015535}
15536
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015537
/* Return the length of the NUL-terminated Py_UNICODE string u; simply
   forwards to wcslen(). */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    return wcslen(u);
}
15543
15544Py_UNICODE*
15545Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15546{
15547 Py_UNICODE *u = s1;
15548 while ((*u++ = *s2++));
15549 return s1;
15550}
15551
15552Py_UNICODE*
15553Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15554{
15555 Py_UNICODE *u = s1;
15556 while ((*u++ = *s2++))
15557 if (n-- == 0)
15558 break;
15559 return s1;
15560}
15561
15562Py_UNICODE*
15563Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15564{
15565 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015566 u1 += wcslen(u1);
15567 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015568 return s1;
15569}
15570
15571int
15572Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15573{
15574 while (*s1 && *s2 && *s1 == *s2)
15575 s1++, s2++;
15576 if (*s1 && *s2)
15577 return (*s1 < *s2) ? -1 : +1;
15578 if (*s1)
15579 return 1;
15580 if (*s2)
15581 return -1;
15582 return 0;
15583}
15584
15585int
15586Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15587{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015588 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015589 for (; n != 0; n--) {
15590 u1 = *s1;
15591 u2 = *s2;
15592 if (u1 != u2)
15593 return (u1 < u2) ? -1 : +1;
15594 if (u1 == '\0')
15595 return 0;
15596 s1++;
15597 s2++;
15598 }
15599 return 0;
15600}
15601
15602Py_UNICODE*
15603Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15604{
15605 const Py_UNICODE *p;
15606 for (p = s; *p; p++)
15607 if (*p == c)
15608 return (Py_UNICODE*)p;
15609 return NULL;
15610}
15611
15612Py_UNICODE*
15613Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15614{
15615 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015616 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015617 while (p != s) {
15618 p--;
15619 if (*p == c)
15620 return (Py_UNICODE*)p;
15621 }
15622 return NULL;
15623}
Victor Stinner331ea922010-08-10 16:37:20 +000015624
Victor Stinner71133ff2010-09-01 23:43:53 +000015625Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015626PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015627{
Victor Stinner577db2c2011-10-11 22:12:48 +020015628 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015629 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015630
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015631 if (!PyUnicode_Check(unicode)) {
15632 PyErr_BadArgument();
15633 return NULL;
15634 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015635 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015636 if (u == NULL)
15637 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015638 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015639 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015640 PyErr_NoMemory();
15641 return NULL;
15642 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015643 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015644 size *= sizeof(Py_UNICODE);
15645 copy = PyMem_Malloc(size);
15646 if (copy == NULL) {
15647 PyErr_NoMemory();
15648 return NULL;
15649 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015650 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015651 return copy;
15652}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015653
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015654
Victor Stinner709d23d2019-05-02 14:56:30 -040015655static int
15656encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015657{
Victor Stinner709d23d2019-05-02 14:56:30 -040015658 int res;
15659 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15660 if (res == -2) {
15661 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15662 return -1;
15663 }
15664 if (res < 0) {
15665 PyErr_NoMemory();
15666 return -1;
15667 }
15668 return 0;
15669}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015670
Victor Stinner709d23d2019-05-02 14:56:30 -040015671
15672static int
15673config_get_codec_name(wchar_t **config_encoding)
15674{
15675 char *encoding;
15676 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15677 return -1;
15678 }
15679
15680 PyObject *name_obj = NULL;
15681 PyObject *codec = _PyCodec_Lookup(encoding);
15682 PyMem_RawFree(encoding);
15683
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015684 if (!codec)
15685 goto error;
15686
15687 name_obj = PyObject_GetAttrString(codec, "name");
15688 Py_CLEAR(codec);
15689 if (!name_obj) {
15690 goto error;
15691 }
15692
Victor Stinner709d23d2019-05-02 14:56:30 -040015693 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15694 Py_DECREF(name_obj);
15695 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015696 goto error;
15697 }
15698
Victor Stinner709d23d2019-05-02 14:56:30 -040015699 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15700 if (raw_wname == NULL) {
15701 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015702 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015703 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015704 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015705
15706 PyMem_RawFree(*config_encoding);
15707 *config_encoding = raw_wname;
15708
15709 PyMem_Free(wname);
15710 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015711
15712error:
15713 Py_XDECREF(codec);
15714 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015715 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015716}
15717
15718
Victor Stinner331a6a52019-05-27 16:39:22 +020015719static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015720init_stdio_encoding(PyInterpreterState *interp)
15721{
Victor Stinner709d23d2019-05-02 14:56:30 -040015722 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015723 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015724 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015725 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015726 "of the stdio encoding");
15727 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015728 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015729}
15730
15731
Victor Stinner709d23d2019-05-02 14:56:30 -040015732static int
15733init_fs_codec(PyInterpreterState *interp)
15734{
Victor Stinner331a6a52019-05-27 16:39:22 +020015735 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015736
15737 _Py_error_handler error_handler;
15738 error_handler = get_error_handler_wide(config->filesystem_errors);
15739 if (error_handler == _Py_ERROR_UNKNOWN) {
15740 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15741 return -1;
15742 }
15743
15744 char *encoding, *errors;
15745 if (encode_wstr_utf8(config->filesystem_encoding,
15746 &encoding,
15747 "filesystem_encoding") < 0) {
15748 return -1;
15749 }
15750
15751 if (encode_wstr_utf8(config->filesystem_errors,
15752 &errors,
15753 "filesystem_errors") < 0) {
15754 PyMem_RawFree(encoding);
15755 return -1;
15756 }
15757
15758 PyMem_RawFree(interp->fs_codec.encoding);
15759 interp->fs_codec.encoding = encoding;
15760 PyMem_RawFree(interp->fs_codec.errors);
15761 interp->fs_codec.errors = errors;
15762 interp->fs_codec.error_handler = error_handler;
15763
15764 /* At this point, PyUnicode_EncodeFSDefault() and
15765 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15766 the C implementation of the filesystem encoding. */
15767
15768 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15769 global configuration variables. */
15770 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15771 interp->fs_codec.errors) < 0) {
15772 PyErr_NoMemory();
15773 return -1;
15774 }
15775 return 0;
15776}
15777
15778
Victor Stinner331a6a52019-05-27 16:39:22 +020015779static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015780init_fs_encoding(PyInterpreterState *interp)
15781{
Victor Stinner709d23d2019-05-02 14:56:30 -040015782 /* Update the filesystem encoding to the normalized Python codec name.
15783 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15784 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015785 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015786 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015787 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015788 "of the filesystem encoding");
15789 }
15790
Victor Stinner709d23d2019-05-02 14:56:30 -040015791 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015792 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015793 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015794 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015795}
15796
15797
Victor Stinner331a6a52019-05-27 16:39:22 +020015798PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015799_PyUnicode_InitEncodings(PyInterpreterState *interp)
15800{
Victor Stinner331a6a52019-05-27 16:39:22 +020015801 PyStatus status = init_fs_encoding(interp);
15802 if (_PyStatus_EXCEPTION(status)) {
15803 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015804 }
15805
15806 return init_stdio_encoding(interp);
15807}
15808
15809
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-install the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set when
   the new configuration strings cannot be allocated; in that case the
   configuration is left unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (new_encoding != NULL && new_errors != NULL) {
        PyMem_RawFree(config->filesystem_encoding);
        config->filesystem_encoding = new_encoding;
        PyMem_RawFree(config->filesystem_errors);
        config->filesystem_errors = new_errors;

        return init_fs_codec(interp);
    }

    /* At least one allocation failed: release whichever succeeded. */
    PyMem_RawFree(new_encoding);
    PyMem_RawFree(new_errors);
    PyErr_NoMemory();
    return -1;
}
#endif
15835
15836
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015837void
15838_PyUnicode_Fini(void)
15839{
15840#if defined(WITH_VALGRIND) || defined(__INSURE__)
15841 /* Insure++ is a memory analysis tool that aids in discovering
15842 * memory leaks and other memory problems. On Python exit, the
15843 * interned string dictionaries are flagged as being in use at exit
15844 * (which it is). Under normal circumstances, this is fine because
15845 * the memory will be automatically reclaimed by the system. Under
15846 * memory debugging, it's a huge source of useless noise, so we
15847 * trade off slower shutdown for less distraction in the memory
15848 * reports. -baw
15849 */
15850 unicode_release_interned();
15851#endif /* __INSURE__ */
15852
15853 Py_CLEAR(unicode_empty);
15854
15855 for (Py_ssize_t i = 0; i < 256; i++) {
15856 Py_CLEAR(unicode_latin1[i]);
15857 }
15858 _PyUnicode_ClearStaticStrings();
15859 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015860
15861 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15862 PyMem_RawFree(interp->fs_codec.encoding);
15863 interp->fs_codec.encoding = NULL;
15864 PyMem_RawFree(interp->fs_codec.errors);
15865 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015866}
15867
15868
Georg Brandl66c221e2010-10-14 07:04:07 +000015869/* A _string module, to export formatter_parser and formatter_field_name_split
15870 to the string.Formatter class implemented in Python. */
15871
15872static PyMethodDef _string_methods[] = {
15873 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15874 METH_O, PyDoc_STR("split the argument as a field name")},
15875 {"formatter_parser", (PyCFunction) formatter_parser,
15876 METH_O, PyDoc_STR("parse the argument as a format string")},
15877 {NULL, NULL}
15878};
15879
15880static struct PyModuleDef _string_module = {
15881 PyModuleDef_HEAD_INIT,
15882 "_string",
15883 PyDoc_STR("string helper module"),
15884 0,
15885 _string_methods,
15886 NULL,
15887 NULL,
15888 NULL,
15889 NULL
15890};
15891
15892PyMODINIT_FUNC
15893PyInit__string(void)
15894{
15895 return PyModule_Create(&_string_module);
15896}
15897
15898
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015899#ifdef __cplusplus
15900}
15901#endif