blob: 0fe7b5658bef36a68fb565a718951ee3c0f1c524 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the assertions in this file.  Debug builds run
   _PyUnicode_CheckConsistency() with check_content=0 (the O(n) content scan
   is skipped); other builds only test the object type. */
#ifdef Py_DEBUG
# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Field accessors for the string object layouts.  Macros that consist of a
   cast followed by a member access perform no checks at all; the others
   assert the validity of the object first. */

/* Unchecked access to the utf8 pointer of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked variant: for a compact ASCII string the result is the address
   immediately following the PyASCIIObject header, otherwise the utf8
   member.  The object must be ready. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length member. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked variant: for a compact ASCII string the result is the string
   length, otherwise the utf8_length member.  The object must be ready. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr pointer and to its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)
/* Kind and length, read after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Drop any earlier definition of PyUnicode_READY() and use a variant that
   asserts _PyUnicode_CHECK(op) first.  It evaluates to 0 when the string is
   already ready and to the result of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? \
      0 : \
      _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the utf8 pointer of the string aliases its character data, i.e.
   no separate UTF-8 buffer is attached to the object.  Must not be used on
   compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the string aliases its character data.
   Only the macro argument is referenced, so the macro works whatever the
   caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or the pointer differs from the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
165
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each of begin, end and to is evaluated exactly once.  Every element is
   converted with a plain cast to to_type; no range check is performed.  The
   first loop handles four elements per iteration, the second loop copies
   the remaining elements one at a time.  The macro declares the locals
   _to, _iter, _end, n and _unrolled_end inside its own do/while scope. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do { \
        to_type *_to = (to_type *)(to); \
        const from_type *_iter = (from_type *)(begin); \
        const from_type *_end = (from_type *)(end); \
        Py_ssize_t n = (_end) - (_iter); \
        const from_type *_unrolled_end = \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4); \
        while (_iter < (_unrolled_end)) { \
            _to[0] = (to_type) _iter[0]; \
            _to[1] = (to_type) _iter[1]; \
            _to[2] = (to_type) _iter[2]; \
            _to[3] = (to_type) _iter[3]; \
            _iter += 4; _to += 4; \
        } \
        while (_iter < (_end)) \
            *_to++ = (to_type) *_iter++; \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Platform-dependent overallocation factor.
   NOTE(review): the percentages below suggest that the extra space is
   computed as size / OVERALLOCATE_FACTOR -- confirm against the users of
   this macro. */
#ifdef MS_WINDOWS
   /* On Windows, overallocating by 50% gives the best results */
#  define OVERALLOCATE_FACTOR 2
#else
   /* On Linux, overallocating by 25% gives the best results */
#  define OVERALLOCATE_FACTOR 4
#endif
197
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
/* The empty Unicode object is shared to improve performance.  It starts
   out as NULL and is created on first use by _Py_INCREF_UNICODE_EMPTY(). */
static PyObject *unicode_empty = NULL;

/* Take one additional reference to the shared empty string, creating the
   string with PyUnicode_New(0, 0) if it does not exist yet.  If that call
   returns NULL, unicode_empty stays NULL and no reference is taken, so
   users of this macro have to check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY() \
    do { \
        if (unicode_empty != NULL) \
            Py_INCREF(unicode_empty); \
        else { \
            unicode_empty = PyUnicode_New(0, 0); \
            if (unicode_empty != NULL) { \
                Py_INCREF(unicode_empty); \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            } \
        } \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
/* Return the shared empty string from the enclosing function after taking
   a reference to it through _Py_INCREF_UNICODE_EMPTY().  The returned value
   is NULL if the empty string could not be created. */
#define _Py_RETURN_UNICODE_EMPTY() \
    do { \
        _Py_INCREF_UNICODE_EMPTY(); \
        return unicode_empty; \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Victor Stinner709d23d2019-05-02 14:56:30 -0400268static PyObject *
269unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270 const char *errors);
271static PyObject *
272unicode_decode_utf8(const char *s, Py_ssize_t size,
273 _Py_error_handler error_handler, const char *errors,
274 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200276/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200277static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279/* Single character Unicode strings in the Latin-1 range are being
280 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282
/* Fast detection of the most frequent whitespace characters: entry c of
   this 128-entry table is 1 if the ASCII code point c is treated as
   whitespace and 0 otherwise (all entries not listed below are 0). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
313
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200314/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200315static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100317static int unicode_modifiable(PyObject *unicode);
318
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319
Alexander Belopolsky40018472011-02-26 01:02:56 +0000320static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100321_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200322static PyObject *
323_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324static PyObject *
325_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000328unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000329 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100330 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
Alexander Belopolsky40018472011-02-26 01:02:56 +0000333static void
334raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300335 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100336 PyObject *unicode,
337 Py_ssize_t startpos, Py_ssize_t endpos,
338 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000339
/* Fast detection of line breaks: entry c of this 128-entry table is 1 if
   the ASCII code point c is treated as a line break and 0 otherwise (all
   entries not listed below are 0). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
367
INADA Naoki3ae20562017-01-16 20:41:20 +0900368static int convert_uc(PyObject *obj, void *addr);
369
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300370#include "clinic/unicodeobject.c.h"
371
Victor Stinner3d4226a2018-08-29 22:21:32 +0200372_Py_error_handler
373_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200374{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200376 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 }
378 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200385 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_OTHER;
397}
398
Victor Stinner709d23d2019-05-02 14:56:30 -0400399
400static _Py_error_handler
401get_error_handler_wide(const wchar_t *errors)
402{
403 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (wcscmp(errors, L"surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (wcscmp(errors, L"replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (wcscmp(errors, L"ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (wcscmp(errors, L"backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (wcscmp(errors, L"surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425}
426
427
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300428/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000430Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000431PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000433#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000434 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000435#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 /* This is actually an illegal character, so it should
437 not be passed to unichr. */
438 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000439#endif
440}
441
/* Verify the internal invariants of the string object `op`.

   Every invariant is checked with _PyObject_ASSERT(); the function itself
   always returns 1, so that it can be used inside assert().  If
   `check_content` is non-zero (and the string is not of the wchar_t kind),
   the characters are scanned as well to verify that the narrowest possible
   kind is used and that the buffer is null-terminated: an O(n) operation. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    _PyObject_ASSERT(op, PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII string: always 1 byte per character and ready */
        _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
        _PyObject_ASSERT(op, ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact non-ASCII string: the characters directly follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
                                 || kind == PyUnicode_2BYTE_KIND
                                 || kind == PyUnicode_4BYTE_KIND);
            _PyObject_ASSERT(op, ascii->state.ascii == 0);
            _PyObject_ASSERT(op, ascii->state.ready == 1);
            _PyObject_ASSERT(op, compact->utf8 != data);
        }
        else {
            /* non-compact string: the characters are referenced by the
               data.any pointer */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer exists */
                _PyObject_ASSERT(op, ascii->length == 0);
                _PyObject_ASSERT(op, ascii->hash == -1);
                _PyObject_ASSERT(op, ascii->state.compact == 0);
                _PyObject_ASSERT(op, ascii->state.ascii == 0);
                _PyObject_ASSERT(op, ascii->state.ready == 0);
                _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
                _PyObject_ASSERT(op, ascii->wstr != NULL);
                _PyObject_ASSERT(op, data == NULL);
                _PyObject_ASSERT(op, compact->utf8 == NULL);
            }
            else {
                /* ready non-compact string */
                _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
                                     || kind == PyUnicode_2BYTE_KIND
                                     || kind == PyUnicode_4BYTE_KIND);
                _PyObject_ASSERT(op, ascii->state.compact == 0);
                _PyObject_ASSERT(op, ascii->state.ready == 1);
                _PyObject_ASSERT(op, data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII: the UTF-8 representation is the data itself */
                    _PyObject_ASSERT(op, compact->utf8 == data);
                    _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
                }
                else
                    _PyObject_ASSERT(op, compact->utf8 != data);
            }
        }
        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr must alias the data exactly when the kind has the same
               width as wchar_t */
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                _PyObject_ASSERT(op, ascii->wstr == data);
                _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
            } else
                _PyObject_ASSERT(op, ascii->wstr != data);
        }

        /* a missing buffer must have a length of zero */
        if (compact->utf8 == NULL)
            _PyObject_ASSERT(op, compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            _PyObject_ASSERT(op, compact->wstr_length == 0);
    }

    /* check that the best kind is used: O(n) operation */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* find the largest code point stored in the string */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest code point must fall into the range that requires
           exactly this kind (and this ascii flag) */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                _PyObject_ASSERT(op, maxchar >= 128);
                _PyObject_ASSERT(op, maxchar <= 255);
            }
            else
                _PyObject_ASSERT(op, maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            _PyObject_ASSERT(op, maxchar >= 0x100);
            _PyObject_ASSERT(op, maxchar <= 0xFFFF);
        }
        else {
            _PyObject_ASSERT(op, maxchar >= 0x10000);
            _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
        }
        /* the character following the last one must be a null character */
        _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200555
Victor Stinner910337b2011-10-03 03:20:16 +0200556
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100557static PyObject*
558unicode_result_wchar(PyObject *unicode)
559{
560#ifndef Py_DEBUG
561 Py_ssize_t len;
562
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100563 len = _PyUnicode_WSTR_LENGTH(unicode);
564 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200566 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100567 }
568
569 if (len == 1) {
570 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100571 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
573 Py_DECREF(unicode);
574 return latin1_char;
575 }
576 }
577
578 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200579 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100580 return NULL;
581 }
582#else
Victor Stinneraa771272012-10-04 02:32:58 +0200583 assert(Py_REFCNT(unicode) == 1);
584
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100585 /* don't make the result ready in debug mode to ensure that the caller
586 makes the string ready before using it */
587 assert(_PyUnicode_CheckConsistency(unicode, 1));
588#endif
589 return unicode;
590}
591
592static PyObject*
593unicode_result_ready(PyObject *unicode)
594{
595 Py_ssize_t length;
596
597 length = PyUnicode_GET_LENGTH(unicode);
598 if (length == 0) {
599 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100600 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200601 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100602 }
603 return unicode_empty;
604 }
605
606 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200607 void *data = PyUnicode_DATA(unicode);
608 int kind = PyUnicode_KIND(unicode);
609 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100610 if (ch < 256) {
611 PyObject *latin1_char = unicode_latin1[ch];
612 if (latin1_char != NULL) {
613 if (unicode != latin1_char) {
614 Py_INCREF(latin1_char);
615 Py_DECREF(unicode);
616 }
617 return latin1_char;
618 }
619 else {
620 assert(_PyUnicode_CheckConsistency(unicode, 1));
621 Py_INCREF(unicode);
622 unicode_latin1[ch] = unicode;
623 return unicode;
624 }
625 }
626 }
627
628 assert(_PyUnicode_CheckConsistency(unicode, 1));
629 return unicode;
630}
631
632static PyObject*
633unicode_result(PyObject *unicode)
634{
635 assert(_PyUnicode_CHECK(unicode));
636 if (PyUnicode_IS_READY(unicode))
637 return unicode_result_ready(unicode);
638 else
639 return unicode_result_wchar(unicode);
640}
641
Victor Stinnerc4b49542011-12-11 22:44:26 +0100642static PyObject*
643unicode_result_unchanged(PyObject *unicode)
644{
645 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500646 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100647 return NULL;
648 Py_INCREF(unicode);
649 return unicode;
650 }
651 else
652 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100653 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100654}
655
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200656/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 0x100)
678 incr = 2+2;
679 else if (ch < 0x10000)
680 incr = 2+4;
681 else {
682 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200683 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200684 }
685 if (size > PY_SSIZE_T_MAX - incr) {
686 PyErr_SetString(PyExc_OverflowError,
687 "encoded result is too long for a Python string");
688 return NULL;
689 }
690 size += incr;
691 }
692
Victor Stinnerad771582015-10-09 12:38:53 +0200693 str = _PyBytesWriter_Prepare(writer, str, size);
694 if (str == NULL)
695 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696
697 /* generate replacement */
698 for (i = collstart; i < collend; ++i) {
699 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200700 *str++ = '\\';
701 if (ch >= 0x00010000) {
702 *str++ = 'U';
703 *str++ = Py_hexdigits[(ch>>28)&0xf];
704 *str++ = Py_hexdigits[(ch>>24)&0xf];
705 *str++ = Py_hexdigits[(ch>>20)&0xf];
706 *str++ = Py_hexdigits[(ch>>16)&0xf];
707 *str++ = Py_hexdigits[(ch>>12)&0xf];
708 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200709 }
Victor Stinner797485e2015-10-09 03:17:30 +0200710 else if (ch >= 0x100) {
711 *str++ = 'u';
712 *str++ = Py_hexdigits[(ch>>12)&0xf];
713 *str++ = Py_hexdigits[(ch>>8)&0xf];
714 }
715 else
716 *str++ = 'x';
717 *str++ = Py_hexdigits[(ch>>4)&0xf];
718 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 }
720 return str;
721}
722
723/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
724 ASCII, Latin1, UTF-8, etc. */
725static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200726xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200727 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
728{
Victor Stinnerad771582015-10-09 12:38:53 +0200729 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200730 Py_UCS4 ch;
731 enum PyUnicode_Kind kind;
732 void *data;
733
734 assert(PyUnicode_IS_READY(unicode));
735 kind = PyUnicode_KIND(unicode);
736 data = PyUnicode_DATA(unicode);
737
738 size = 0;
739 /* determine replacement size */
740 for (i = collstart; i < collend; ++i) {
741 Py_ssize_t incr;
742
743 ch = PyUnicode_READ(kind, data, i);
744 if (ch < 10)
745 incr = 2+1+1;
746 else if (ch < 100)
747 incr = 2+2+1;
748 else if (ch < 1000)
749 incr = 2+3+1;
750 else if (ch < 10000)
751 incr = 2+4+1;
752 else if (ch < 100000)
753 incr = 2+5+1;
754 else if (ch < 1000000)
755 incr = 2+6+1;
756 else {
757 assert(ch <= MAX_UNICODE);
758 incr = 2+7+1;
759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
775 }
776 return str;
777}
778
Thomas Wouters477c8d52006-05-27 19:21:47 +0000779/* --- Bloom Filters ----------------------------------------------------- */
780
781/* stuff to implement simple "bloom filters" for Unicode characters.
782 to keep things simple, we use a single bitmask, using the least 5
783 bits from each unicode characters as the bit index. */
784
785/* the linebreak mask is set up by Unicode_Init below */
786
Antoine Pitrouf068f942010-01-13 14:19:12 +0000787#if LONG_BIT >= 128
788#define BLOOM_WIDTH 128
789#elif LONG_BIT >= 64
790#define BLOOM_WIDTH 64
791#elif LONG_BIT >= 32
792#define BLOOM_WIDTH 32
793#else
794#error "LONG_BIT is smaller than 32"
795#endif
796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797#define BLOOM_MASK unsigned long
798
Serhiy Storchaka05997252013-01-26 12:14:02 +0200799static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000800
Antoine Pitrouf068f942010-01-13 14:19:12 +0000801#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802
Benjamin Peterson29060642009-01-31 22:14:21 +0000803#define BLOOM_LINEBREAK(ch) \
804 ((ch) < 128U ? ascii_linebreak[(ch)] : \
805 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700807static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809{
Victor Stinnera85af502013-04-09 21:53:54 +0200810#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
811 do { \
812 TYPE *data = (TYPE *)PTR; \
813 TYPE *end = data + LEN; \
814 Py_UCS4 ch; \
815 for (; data != end; data++) { \
816 ch = *data; \
817 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
818 } \
819 break; \
820 } while (0)
821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* calculate simple bloom-style bitmask for a given unicode string */
823
Antoine Pitrouf068f942010-01-13 14:19:12 +0000824 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000825
826 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200827 switch (kind) {
828 case PyUnicode_1BYTE_KIND:
829 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
830 break;
831 case PyUnicode_2BYTE_KIND:
832 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
833 break;
834 case PyUnicode_4BYTE_KIND:
835 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
836 break;
837 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700838 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200839 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000840 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200841
842#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000843}
844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300845static int
846ensure_unicode(PyObject *obj)
847{
848 if (!PyUnicode_Check(obj)) {
849 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200850 "must be str, not %.100s",
851 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300852 return -1;
853 }
854 return PyUnicode_READY(obj);
855}
856
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857/* Compilation of templated routines */
858
859#include "stringlib/asciilib.h"
860#include "stringlib/fastsearch.h"
861#include "stringlib/partition.h"
862#include "stringlib/split.h"
863#include "stringlib/count.h"
864#include "stringlib/find.h"
865#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200866#include "stringlib/undef.h"
867
868#include "stringlib/ucs1lib.h"
869#include "stringlib/fastsearch.h"
870#include "stringlib/partition.h"
871#include "stringlib/split.h"
872#include "stringlib/count.h"
873#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300874#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200875#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200876#include "stringlib/undef.h"
877
878#include "stringlib/ucs2lib.h"
879#include "stringlib/fastsearch.h"
880#include "stringlib/partition.h"
881#include "stringlib/split.h"
882#include "stringlib/count.h"
883#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300884#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200885#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200886#include "stringlib/undef.h"
887
888#include "stringlib/ucs4lib.h"
889#include "stringlib/fastsearch.h"
890#include "stringlib/partition.h"
891#include "stringlib/split.h"
892#include "stringlib/count.h"
893#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300894#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200895#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200896#include "stringlib/undef.h"
897
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200898#include "stringlib/unicodedefs.h"
899#include "stringlib/fastsearch.h"
900#include "stringlib/count.h"
901#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100902#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200903
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904/* --- Unicode Object ----------------------------------------------------- */
905
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700906static inline Py_ssize_t
907findchar(const void *s, int kind,
908 Py_ssize_t size, Py_UCS4 ch,
909 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200911 switch (kind) {
912 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200913 if ((Py_UCS1) ch != ch)
914 return -1;
915 if (direction > 0)
916 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
917 else
918 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200919 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200920 if ((Py_UCS2) ch != ch)
921 return -1;
922 if (direction > 0)
923 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
924 else
925 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200926 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200927 if (direction > 0)
928 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
929 else
930 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700932 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934}
935
#ifdef Py_DEBUG
/* Fill the data of a Unicode string with invalid characters to detect bugs
   earlier.

   Only the characters added by a resize, i.e. the range
   [old_length, current length), are overwritten with 0xff bytes; nothing
   is done when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value doubles as the per-character byte width here. */
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
954
Victor Stinnerfe226c02011-10-03 03:52:20 +0200955static PyObject*
956resize_compact(PyObject *unicode, Py_ssize_t length)
957{
958 Py_ssize_t char_size;
959 Py_ssize_t struct_size;
960 Py_ssize_t new_size;
961 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100962 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200963#ifdef Py_DEBUG
964 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
965#endif
966
Victor Stinner79891572012-05-03 13:43:07 +0200967 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100969 assert(PyUnicode_IS_COMPACT(unicode));
970
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200971 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100972 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 struct_size = sizeof(PyASCIIObject);
974 else
975 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200976 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
979 PyErr_NoMemory();
980 return NULL;
981 }
982 new_size = (struct_size + (length + 1) * char_size);
983
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200984 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
985 PyObject_DEL(_PyUnicode_UTF8(unicode));
986 _PyUnicode_UTF8(unicode) = NULL;
987 _PyUnicode_UTF8_LENGTH(unicode) = 0;
988 }
Victor Stinner84def372011-12-11 20:04:56 +0100989 _Py_DEC_REFTOTAL;
990 _Py_ForgetReference(unicode);
991
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300992 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100993 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100994 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 PyErr_NoMemory();
996 return NULL;
997 }
Victor Stinner84def372011-12-11 20:04:56 +0100998 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001000
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001004 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001005 _PyUnicode_WSTR_LENGTH(unicode) = length;
1006 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001007 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1008 PyObject_DEL(_PyUnicode_WSTR(unicode));
1009 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001010 if (!PyUnicode_IS_ASCII(unicode))
1011 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001012 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001016 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1017 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 return unicode;
1020}
1021
Alexander Belopolsky40018472011-02-26 01:02:56 +00001022static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001023resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024{
Victor Stinner95663112011-10-04 01:03:50 +02001025 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001026 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 if (PyUnicode_IS_READY(unicode)) {
1031 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001032 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1036#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037
1038 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001039 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001040 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1041 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042
1043 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1044 PyErr_NoMemory();
1045 return -1;
1046 }
1047 new_size = (length + 1) * char_size;
1048
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1050 {
1051 PyObject_DEL(_PyUnicode_UTF8(unicode));
1052 _PyUnicode_UTF8(unicode) = NULL;
1053 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1054 }
1055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 data = (PyObject *)PyObject_REALLOC(data, new_size);
1057 if (data == NULL) {
1058 PyErr_NoMemory();
1059 return -1;
1060 }
1061 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001062 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 _PyUnicode_WSTR_LENGTH(unicode) = length;
1065 }
1066 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001067 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 _PyUnicode_UTF8_LENGTH(unicode) = length;
1069 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _PyUnicode_LENGTH(unicode) = length;
1071 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001072#ifdef Py_DEBUG
1073 unicode_fill_invalid(unicode, old_length);
1074#endif
Victor Stinner95663112011-10-04 01:03:50 +02001075 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001076 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 }
Victor Stinner95663112011-10-04 01:03:50 +02001080 assert(_PyUnicode_WSTR(unicode) != NULL);
1081
1082 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001083 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001084 PyErr_NoMemory();
1085 return -1;
1086 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001087 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001088 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001090 if (!wstr) {
1091 PyErr_NoMemory();
1092 return -1;
1093 }
1094 _PyUnicode_WSTR(unicode) = wstr;
1095 _PyUnicode_WSTR(unicode)[length] = 0;
1096 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 return 0;
1099}
1100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101static PyObject*
1102resize_copy(PyObject *unicode, Py_ssize_t length)
1103{
1104 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001106 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001107
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001108 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109
1110 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1111 if (copy == NULL)
1112 return NULL;
1113
1114 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001115 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001117 }
1118 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001119 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 if (w == NULL)
1123 return NULL;
1124 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1125 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001126 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001127 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001128 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
1130}
1131
/* We allocate one more character (not just one more byte) to make sure
   the string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
       free list never reduces its size below 1.

*/
1140
Alexander Belopolsky40018472011-02-26 01:02:56 +00001141static PyUnicodeObject *
1142_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001144 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
Thomas Wouters477c8d52006-05-27 19:21:47 +00001147 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 if (length == 0 && unicode_empty != NULL) {
1149 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001150 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 }
1152
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001153 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001155 return (PyUnicodeObject *)PyErr_NoMemory();
1156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 if (length < 0) {
1158 PyErr_SetString(PyExc_SystemError,
1159 "Negative size passed to _PyUnicode_New");
1160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 }
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1164 if (unicode == NULL)
1165 return NULL;
1166 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001167
1168 _PyUnicode_WSTR_LENGTH(unicode) = length;
1169 _PyUnicode_HASH(unicode) = -1;
1170 _PyUnicode_STATE(unicode).interned = 0;
1171 _PyUnicode_STATE(unicode).kind = 0;
1172 _PyUnicode_STATE(unicode).compact = 0;
1173 _PyUnicode_STATE(unicode).ready = 0;
1174 _PyUnicode_STATE(unicode).ascii = 0;
1175 _PyUnicode_DATA_ANY(unicode) = NULL;
1176 _PyUnicode_LENGTH(unicode) = 0;
1177 _PyUnicode_UTF8(unicode) = NULL;
1178 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1181 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001182 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001183 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001184 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186
Jeremy Hyltond8082792003-09-16 19:41:39 +00001187 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001188 * the caller fails before initializing str -- unicode_resize()
1189 * reads str[0], and the Keep-Alive optimization can keep memory
1190 * allocated for str alive across a call to unicode_dealloc(unicode).
1191 * We don't want unicode_resize to read uninitialized memory in
1192 * that case.
1193 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 _PyUnicode_WSTR(unicode)[0] = 0;
1195 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001196
Victor Stinner7931d9a2011-11-04 00:22:48 +01001197 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 return unicode;
1199}
1200
Victor Stinnerf42dc442011-10-02 23:33:16 +02001201static const char*
1202unicode_kind_name(PyObject *unicode)
1203{
Victor Stinner42dfd712011-10-03 14:41:45 +02001204 /* don't check consistency: unicode_kind_name() is called from
1205 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 if (!PyUnicode_IS_COMPACT(unicode))
1207 {
1208 if (!PyUnicode_IS_READY(unicode))
1209 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001210 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001211 {
1212 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001213 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 return "legacy ascii";
1215 else
1216 return "legacy latin1";
1217 case PyUnicode_2BYTE_KIND:
1218 return "legacy UCS2";
1219 case PyUnicode_4BYTE_KIND:
1220 return "legacy UCS4";
1221 default:
1222 return "<legacy invalid kind>";
1223 }
1224 }
1225 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001226 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001227 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001228 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 return "ascii";
1230 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001231 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001232 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001233 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001234 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001235 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001236 default:
1237 return "<invalid compact kind>";
1238 }
1239}
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001243char *_PyUnicode_utf8(void *unicode_raw){
1244 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001245 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001246}
1247
Victor Stinnera42de742018-11-22 10:25:22 +01001248void *_PyUnicode_compact_data(void *unicode_raw) {
1249 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001250 return _PyUnicode_COMPACT_DATA(unicode);
1251}
Victor Stinnera42de742018-11-22 10:25:22 +01001252void *_PyUnicode_data(void *unicode_raw) {
1253 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001254 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1256 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1257 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1258 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1259 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1260 return PyUnicode_DATA(unicode);
1261}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001262
1263void
1264_PyUnicode_Dump(PyObject *op)
1265{
1266 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001267 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1268 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1269 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001270
Victor Stinnera849a4b2011-10-03 12:12:11 +02001271 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001272 {
1273 if (ascii->state.ascii)
1274 data = (ascii + 1);
1275 else
1276 data = (compact + 1);
1277 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001278 else
1279 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001280 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1281 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001282
Victor Stinnera849a4b2011-10-03 12:12:11 +02001283 if (ascii->wstr == data)
1284 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001285 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001286
Victor Stinnera3b334d2011-10-03 13:53:37 +02001287 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001288 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001289 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1290 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001291 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001292 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001293 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001294 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001295}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#endif
1297
1298PyObject *
1299PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1300{
1301 PyObject *obj;
1302 PyCompactUnicodeObject *unicode;
1303 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001304 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001305 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 Py_ssize_t char_size;
1307 Py_ssize_t struct_size;
1308
1309 /* Optimization for empty strings */
1310 if (size == 0 && unicode_empty != NULL) {
1311 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001312 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 }
1314
Victor Stinner9e9d6892011-10-04 01:02:02 +02001315 is_ascii = 0;
1316 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 struct_size = sizeof(PyCompactUnicodeObject);
1318 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001319 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 char_size = 1;
1321 is_ascii = 1;
1322 struct_size = sizeof(PyASCIIObject);
1323 }
1324 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001325 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 char_size = 1;
1327 }
1328 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001329 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 char_size = 2;
1331 if (sizeof(wchar_t) == 2)
1332 is_sharing = 1;
1333 }
1334 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001335 if (maxchar > MAX_UNICODE) {
1336 PyErr_SetString(PyExc_SystemError,
1337 "invalid maximum character passed to PyUnicode_New");
1338 return NULL;
1339 }
Victor Stinner8f825062012-04-27 13:55:39 +02001340 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 char_size = 4;
1342 if (sizeof(wchar_t) == 4)
1343 is_sharing = 1;
1344 }
1345
1346 /* Ensure we won't overflow the size. */
1347 if (size < 0) {
1348 PyErr_SetString(PyExc_SystemError,
1349 "Negative size passed to PyUnicode_New");
1350 return NULL;
1351 }
1352 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1353 return PyErr_NoMemory();
1354
1355 /* Duplicated allocation code from _PyObject_New() instead of a call to
1356 * PyObject_New() so we are able to allocate space for the object and
1357 * it's data buffer.
1358 */
1359 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1360 if (obj == NULL)
1361 return PyErr_NoMemory();
1362 obj = PyObject_INIT(obj, &PyUnicode_Type);
1363 if (obj == NULL)
1364 return NULL;
1365
1366 unicode = (PyCompactUnicodeObject *)obj;
1367 if (is_ascii)
1368 data = ((PyASCIIObject*)obj) + 1;
1369 else
1370 data = unicode + 1;
1371 _PyUnicode_LENGTH(unicode) = size;
1372 _PyUnicode_HASH(unicode) = -1;
1373 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001374 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 _PyUnicode_STATE(unicode).compact = 1;
1376 _PyUnicode_STATE(unicode).ready = 1;
1377 _PyUnicode_STATE(unicode).ascii = is_ascii;
1378 if (is_ascii) {
1379 ((char*)data)[size] = 0;
1380 _PyUnicode_WSTR(unicode) = NULL;
1381 }
Victor Stinner8f825062012-04-27 13:55:39 +02001382 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 ((char*)data)[size] = 0;
1384 _PyUnicode_WSTR(unicode) = NULL;
1385 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001387 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 else {
1390 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001391 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001392 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 ((Py_UCS4*)data)[size] = 0;
1396 if (is_sharing) {
1397 _PyUnicode_WSTR_LENGTH(unicode) = size;
1398 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1399 }
1400 else {
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 }
1404 }
Victor Stinner8f825062012-04-27 13:55:39 +02001405#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001406 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001407#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001408 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 return obj;
1410}
1411
#if SIZEOF_WCHAR_T == 2
/* Decode the 16-bit wchar_t range [begin, end) into the UCS4 buffer of
   `unicode`.  A high surrogate immediately followed by a low surrogate is
   joined into one code point; any other unit, lone surrogates included, is
   copied as is.  The other width conversions are done with macros.

   `unicode` must be a 4-byte kind string whose length matches the number of
   code points produced; its buffer is assumed to have room for one more
   element, the terminating null character, which is not written here. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        Py_UCS4 ch;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        /* src[1] is only read once it is known to be inside the input. */
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            ch = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            ch = *src;
            src += 1;
        }
        *dst++ = ch;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1451
Victor Stinnercd9950f2011-10-02 00:34:53 +02001452static int
Victor Stinner488fa492011-12-12 00:01:39 +01001453unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001454{
Victor Stinner488fa492011-12-12 00:01:39 +01001455 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001456 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001457 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001458 return -1;
1459 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460 return 0;
1461}
1462
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001463static int
1464_copy_characters(PyObject *to, Py_ssize_t to_start,
1465 PyObject *from, Py_ssize_t from_start,
1466 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001468 unsigned int from_kind, to_kind;
1469 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Victor Stinneree4544c2012-05-09 22:24:08 +02001471 assert(0 <= how_many);
1472 assert(0 <= from_start);
1473 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001474 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001475 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001476 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477
Victor Stinnerd3f08822012-05-29 12:57:52 +02001478 assert(PyUnicode_Check(to));
1479 assert(PyUnicode_IS_READY(to));
1480 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1481
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001482 if (how_many == 0)
1483 return 0;
1484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489
Victor Stinnerf1852262012-06-16 16:38:26 +02001490#ifdef Py_DEBUG
1491 if (!check_maxchar
1492 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1493 {
1494 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1495 Py_UCS4 ch;
1496 Py_ssize_t i;
1497 for (i=0; i < how_many; i++) {
1498 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1499 assert(ch <= to_maxchar);
1500 }
1501 }
1502#endif
1503
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001504 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001505 if (check_maxchar
1506 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1507 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001508 /* Writing Latin-1 characters into an ASCII string requires to
1509 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 Py_UCS4 max_char;
1511 max_char = ucs1lib_find_max_char(from_data,
1512 (Py_UCS1*)from_data + how_many);
1513 if (max_char >= 128)
1514 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 }
Christian Heimesf051e432016-09-13 20:22:02 +02001516 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001517 (char*)from_data + from_kind * from_start,
1518 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 else if (from_kind == PyUnicode_1BYTE_KIND
1521 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001522 {
1523 _PyUnicode_CONVERT_BYTES(
1524 Py_UCS1, Py_UCS2,
1525 PyUnicode_1BYTE_DATA(from) + from_start,
1526 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1527 PyUnicode_2BYTE_DATA(to) + to_start
1528 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001529 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001530 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001531 && to_kind == PyUnicode_4BYTE_KIND)
1532 {
1533 _PyUnicode_CONVERT_BYTES(
1534 Py_UCS1, Py_UCS4,
1535 PyUnicode_1BYTE_DATA(from) + from_start,
1536 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1537 PyUnicode_4BYTE_DATA(to) + to_start
1538 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001539 }
1540 else if (from_kind == PyUnicode_2BYTE_KIND
1541 && to_kind == PyUnicode_4BYTE_KIND)
1542 {
1543 _PyUnicode_CONVERT_BYTES(
1544 Py_UCS2, Py_UCS4,
1545 PyUnicode_2BYTE_DATA(from) + from_start,
1546 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1547 PyUnicode_4BYTE_DATA(to) + to_start
1548 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001549 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001551 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1552
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001553 if (!check_maxchar) {
1554 if (from_kind == PyUnicode_2BYTE_KIND
1555 && to_kind == PyUnicode_1BYTE_KIND)
1556 {
1557 _PyUnicode_CONVERT_BYTES(
1558 Py_UCS2, Py_UCS1,
1559 PyUnicode_2BYTE_DATA(from) + from_start,
1560 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1561 PyUnicode_1BYTE_DATA(to) + to_start
1562 );
1563 }
1564 else if (from_kind == PyUnicode_4BYTE_KIND
1565 && to_kind == PyUnicode_1BYTE_KIND)
1566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS4, Py_UCS1,
1569 PyUnicode_4BYTE_DATA(from) + from_start,
1570 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_1BYTE_DATA(to) + to_start
1572 );
1573 }
1574 else if (from_kind == PyUnicode_4BYTE_KIND
1575 && to_kind == PyUnicode_2BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS4, Py_UCS2,
1579 PyUnicode_4BYTE_DATA(from) + from_start,
1580 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_2BYTE_DATA(to) + to_start
1582 );
1583 }
1584 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001585 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001586 }
1587 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001589 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 Py_ssize_t i;
1592
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 if (ch > to_maxchar)
1596 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001597 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1598 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 }
1600 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001601 return 0;
1602}
1603
Victor Stinnerd3f08822012-05-29 12:57:52 +02001604void
1605_PyUnicode_FastCopyCharacters(
1606 PyObject *to, Py_ssize_t to_start,
1607 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608{
1609 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1610}
1611
1612Py_ssize_t
1613PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1614 PyObject *from, Py_ssize_t from_start,
1615 Py_ssize_t how_many)
1616{
1617 int err;
1618
1619 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1620 PyErr_BadInternalCall();
1621 return -1;
1622 }
1623
Benjamin Petersonbac79492012-01-14 13:34:47 -05001624 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001625 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001626 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001627 return -1;
1628
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001629 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001630 PyErr_SetString(PyExc_IndexError, "string index out of range");
1631 return -1;
1632 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001633 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001634 PyErr_SetString(PyExc_IndexError, "string index out of range");
1635 return -1;
1636 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001637 if (how_many < 0) {
1638 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1639 return -1;
1640 }
1641 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001642 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1643 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001644 "Cannot write %zi characters at %zi "
1645 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001646 how_many, to_start, PyUnicode_GET_LENGTH(to));
1647 return -1;
1648 }
1649
1650 if (how_many == 0)
1651 return 0;
1652
Victor Stinner488fa492011-12-12 00:01:39 +01001653 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001654 return -1;
1655
1656 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1657 if (err) {
1658 PyErr_Format(PyExc_SystemError,
1659 "Cannot copy %s characters "
1660 "into a string of %s characters",
1661 unicode_kind_name(from),
1662 unicode_kind_name(to));
1663 return -1;
1664 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001665 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666}
1667
Victor Stinner17222162011-09-28 22:15:37 +02001668/* Find the maximum code point and count the number of surrogate pairs so a
1669 correct string length can be computed before converting a string to UCS4.
1670 This function counts single surrogates as a character and not as a pair.
1671
1672 Return 0 on success, or -1 on error. */
1673static int
1674find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1675 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676{
1677 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679
Victor Stinnerc53be962011-10-02 21:33:54 +02001680 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 *num_surrogates = 0;
1682 *maxchar = 0;
1683
1684 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001686 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1687 && (iter+1) < end
1688 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1689 {
1690 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1691 ++(*num_surrogates);
1692 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001696 {
1697 ch = *iter;
1698 iter++;
1699 }
1700 if (ch > *maxchar) {
1701 *maxchar = ch;
1702 if (*maxchar > MAX_UNICODE) {
1703 PyErr_Format(PyExc_ValueError,
1704 "character U+%x is not in range [U+0000; U+10ffff]",
1705 ch);
1706 return -1;
1707 }
1708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
1710 return 0;
1711}
1712
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001713int
1714_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715{
1716 wchar_t *end;
1717 Py_UCS4 maxchar = 0;
1718 Py_ssize_t num_surrogates;
1719#if SIZEOF_WCHAR_T == 2
1720 Py_ssize_t length_wo_surrogates;
1721#endif
1722
Georg Brandl7597add2011-10-05 16:36:47 +02001723 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001724 strings were created using _PyObject_New() and where no canonical
1725 representation (the str field) has been set yet aka strings
1726 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001727 assert(_PyUnicode_CHECK(unicode));
1728 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001731 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 /* Actually, it should neither be interned nor be anything else: */
1733 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001736 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001737 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739
1740 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001741 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1742 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 PyErr_NoMemory();
1744 return -1;
1745 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001746 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 _PyUnicode_WSTR(unicode), end,
1748 PyUnicode_1BYTE_DATA(unicode));
1749 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1750 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1751 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1752 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001753 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001754 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001755 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 }
1757 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001758 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001759 _PyUnicode_UTF8(unicode) = NULL;
1760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 }
1762 PyObject_FREE(_PyUnicode_WSTR(unicode));
1763 _PyUnicode_WSTR(unicode) = NULL;
1764 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1765 }
1766 /* In this case we might have to convert down from 4-byte native
1767 wchar_t to 2-byte unicode. */
1768 else if (maxchar < 65536) {
1769 assert(num_surrogates == 0 &&
1770 "FindMaxCharAndNumSurrogatePairs() messed up");
1771
Victor Stinner506f5922011-09-28 22:34:18 +02001772#if SIZEOF_WCHAR_T == 2
1773 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001775 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1776 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1777 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001778 _PyUnicode_UTF8(unicode) = NULL;
1779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001780#else
1781 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001782 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001783 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001785 PyErr_NoMemory();
1786 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 }
Victor Stinner506f5922011-09-28 22:34:18 +02001788 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1789 _PyUnicode_WSTR(unicode), end,
1790 PyUnicode_2BYTE_DATA(unicode));
1791 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1793 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001796 PyObject_FREE(_PyUnicode_WSTR(unicode));
1797 _PyUnicode_WSTR(unicode) = NULL;
1798 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1799#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1802 else {
1803#if SIZEOF_WCHAR_T == 2
1804 /* in case the native representation is 2-bytes, we need to allocate a
1805 new normalized 4-byte version. */
1806 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001807 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1808 PyErr_NoMemory();
1809 return -1;
1810 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001811 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1812 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 PyErr_NoMemory();
1814 return -1;
1815 }
1816 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1817 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001818 _PyUnicode_UTF8(unicode) = NULL;
1819 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001820 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1821 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001822 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyObject_FREE(_PyUnicode_WSTR(unicode));
1824 _PyUnicode_WSTR(unicode) = NULL;
1825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1826#else
1827 assert(num_surrogates == 0);
1828
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 _PyUnicode_UTF8(unicode) = NULL;
1832 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1834#endif
1835 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1836 }
1837 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001838 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return 0;
1840}
1841
Alexander Belopolsky40018472011-02-26 01:02:56 +00001842static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001843unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844{
Walter Dörwald16807132007-05-25 13:52:07 +00001845 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 case SSTATE_NOT_INTERNED:
1847 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001848
Benjamin Peterson29060642009-01-31 22:14:21 +00001849 case SSTATE_INTERNED_MORTAL:
1850 /* revive dead object temporarily for DelItem */
1851 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001852 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 Py_FatalError(
1854 "deletion of interned string failed");
1855 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001856
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 case SSTATE_INTERNED_IMMORTAL:
1858 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001859 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001860
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 default:
1862 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001863 }
1864
Victor Stinner03490912011-10-03 23:45:12 +02001865 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001867 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001868 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1870 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001872 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873}
1874
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is one of the shared singletons
   (the empty string, or a one-character string stored in the
   unicode_latin1 cache), 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a string of length one that is not of the legacy wchar kind can
       be a cached Latin-1 character. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode);
}
#endif
1891
Alexander Belopolsky40018472011-02-26 01:02:56 +00001892static int
Victor Stinner488fa492011-12-12 00:01:39 +01001893unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894{
Victor Stinner488fa492011-12-12 00:01:39 +01001895 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001896 if (Py_REFCNT(unicode) != 1)
1897 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001898 if (_PyUnicode_HASH(unicode) != -1)
1899 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001900 if (PyUnicode_CHECK_INTERNED(unicode))
1901 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!PyUnicode_CheckExact(unicode))
1903 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001904#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001905 /* singleton refcount is greater than 1 */
1906 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001907#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001908 return 1;
1909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Victor Stinnerfe226c02011-10-03 03:52:20 +02001911static int
1912unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1913{
1914 PyObject *unicode;
1915 Py_ssize_t old_length;
1916
1917 assert(p_unicode != NULL);
1918 unicode = *p_unicode;
1919
1920 assert(unicode != NULL);
1921 assert(PyUnicode_Check(unicode));
1922 assert(0 <= length);
1923
Victor Stinner910337b2011-10-03 03:20:16 +02001924 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001925 old_length = PyUnicode_WSTR_LENGTH(unicode);
1926 else
1927 old_length = PyUnicode_GET_LENGTH(unicode);
1928 if (old_length == length)
1929 return 0;
1930
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001931 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001932 _Py_INCREF_UNICODE_EMPTY();
1933 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001935 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001936 return 0;
1937 }
1938
Victor Stinner488fa492011-12-12 00:01:39 +01001939 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 PyObject *copy = resize_copy(unicode, length);
1941 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001943 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001945 }
1946
Victor Stinnerfe226c02011-10-03 03:52:20 +02001947 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001948 PyObject *new_unicode = resize_compact(unicode, length);
1949 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001950 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001951 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001953 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001954 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001955}
1956
Alexander Belopolsky40018472011-02-26 01:02:56 +00001957int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001958PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001959{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001960 PyObject *unicode;
1961 if (p_unicode == NULL) {
1962 PyErr_BadInternalCall();
1963 return -1;
1964 }
1965 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001966 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 {
1968 PyErr_BadInternalCall();
1969 return -1;
1970 }
1971 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001972}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001973
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001974/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001975
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001976 WARNING: The function doesn't copy the terminating null character and
1977 doesn't check the maximum character (may write a latin1 character in an
1978 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001979static void
1980unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1981 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982{
1983 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1984 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001985 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001986
1987 switch (kind) {
1988 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001989 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001990#ifdef Py_DEBUG
1991 if (PyUnicode_IS_ASCII(unicode)) {
1992 Py_UCS4 maxchar = ucs1lib_find_max_char(
1993 (const Py_UCS1*)str,
1994 (const Py_UCS1*)str + len);
1995 assert(maxchar < 128);
1996 }
1997#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001998 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001999 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002000 }
2001 case PyUnicode_2BYTE_KIND: {
2002 Py_UCS2 *start = (Py_UCS2 *)data + index;
2003 Py_UCS2 *ucs2 = start;
2004 assert(index <= PyUnicode_GET_LENGTH(unicode));
2005
Victor Stinner184252a2012-06-16 02:57:41 +02002006 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002007 *ucs2 = (Py_UCS2)*str;
2008
2009 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002010 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002011 }
2012 default: {
2013 Py_UCS4 *start = (Py_UCS4 *)data + index;
2014 Py_UCS4 *ucs4 = start;
2015 assert(kind == PyUnicode_4BYTE_KIND);
2016 assert(index <= PyUnicode_GET_LENGTH(unicode));
2017
Victor Stinner184252a2012-06-16 02:57:41 +02002018 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002019 *ucs4 = (Py_UCS4)*str;
2020
2021 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002022 }
2023 }
2024}
2025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026static PyObject*
2027get_latin1_char(unsigned char ch)
2028{
Victor Stinnera464fc12011-10-02 20:39:30 +02002029 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002031 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!unicode)
2033 return NULL;
2034 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 unicode_latin1[ch] = unicode;
2037 }
2038 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002039 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040}
2041
Victor Stinner985a82a2014-01-03 12:53:47 +01002042static PyObject*
2043unicode_char(Py_UCS4 ch)
2044{
2045 PyObject *unicode;
2046
2047 assert(ch <= MAX_UNICODE);
2048
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002049 if (ch < 256)
2050 return get_latin1_char(ch);
2051
Victor Stinner985a82a2014-01-03 12:53:47 +01002052 unicode = PyUnicode_New(1, ch);
2053 if (unicode == NULL)
2054 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002055
2056 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2057 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002058 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002059 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002060 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2061 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2062 }
2063 assert(_PyUnicode_CheckConsistency(unicode, 1));
2064 return unicode;
2065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067PyObject *
2068PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002070 if (u == NULL)
2071 return (PyObject*)_PyUnicode_New(size);
2072
2073 if (size < 0) {
2074 PyErr_BadInternalCall();
2075 return NULL;
2076 }
2077
2078 return PyUnicode_FromWideChar(u, size);
2079}
2080
2081PyObject *
2082PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2083{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002084 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 Py_UCS4 maxchar = 0;
2086 Py_ssize_t num_surrogates;
2087
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002088 if (u == NULL && size != 0) {
2089 PyErr_BadInternalCall();
2090 return NULL;
2091 }
2092
2093 if (size == -1) {
2094 size = wcslen(u);
2095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002097 /* If the Unicode data is known at construction time, we can apply
2098 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002101 if (size == 0)
2102 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 /* Single character Unicode objects in the Latin-1 range are
2105 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002106 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 return get_latin1_char((unsigned char)*u);
2108
2109 /* If not empty and not single character, copy the Unicode data
2110 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002111 if (find_maxchar_surrogates(u, u + size,
2112 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return NULL;
2114
Victor Stinner8faf8212011-12-08 22:14:11 +01002115 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 if (!unicode)
2117 return NULL;
2118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 switch (PyUnicode_KIND(unicode)) {
2120 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002121 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2123 break;
2124 case PyUnicode_2BYTE_KIND:
2125#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002126 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002128 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2130#endif
2131 break;
2132 case PyUnicode_4BYTE_KIND:
2133#if SIZEOF_WCHAR_T == 2
2134 /* This is the only case which has to process surrogates, thus
2135 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002136 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137#else
2138 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002139 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140#endif
2141 break;
2142 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002143 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002146 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147}
2148
Alexander Belopolsky40018472011-02-26 01:02:56 +00002149PyObject *
2150PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002151{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 if (size < 0) {
2153 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002154 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 return NULL;
2156 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002157 if (u != NULL)
2158 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2159 else
2160 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002161}
2162
Alexander Belopolsky40018472011-02-26 01:02:56 +00002163PyObject *
2164PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002165{
2166 size_t size = strlen(u);
2167 if (size > PY_SSIZE_T_MAX) {
2168 PyErr_SetString(PyExc_OverflowError, "input too long");
2169 return NULL;
2170 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002171 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002172}
2173
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002174PyObject *
2175_PyUnicode_FromId(_Py_Identifier *id)
2176{
2177 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002178 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2179 strlen(id->string),
2180 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002181 if (!id->object)
2182 return NULL;
2183 PyUnicode_InternInPlace(&id->object);
2184 assert(!id->next);
2185 id->next = static_strings;
2186 static_strings = id;
2187 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002188 return id->object;
2189}
2190
2191void
2192_PyUnicode_ClearStaticStrings()
2193{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002194 _Py_Identifier *tmp, *s = static_strings;
2195 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002196 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002197 tmp = s->next;
2198 s->next = NULL;
2199 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002200 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002201 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002202}
2203
Benjamin Peterson0df54292012-03-26 14:50:32 -04002204/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205
Victor Stinnerd3f08822012-05-29 12:57:52 +02002206PyObject*
2207_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002208{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002209 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002210 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002211 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002212#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002213 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002214#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002215 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002216 }
Victor Stinner785938e2011-12-11 20:09:03 +01002217 unicode = PyUnicode_New(size, 127);
2218 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002219 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002220 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2221 assert(_PyUnicode_CheckConsistency(unicode, 1));
2222 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002223}
2224
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002225static Py_UCS4
2226kind_maxchar_limit(unsigned int kind)
2227{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002228 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002229 case PyUnicode_1BYTE_KIND:
2230 return 0x80;
2231 case PyUnicode_2BYTE_KIND:
2232 return 0x100;
2233 case PyUnicode_4BYTE_KIND:
2234 return 0x10000;
2235 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002236 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002237 }
2238}
2239
Victor Stinner702c7342011-10-05 13:50:52 +02002240static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002241_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002244 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245
Serhiy Storchaka678db842013-01-26 12:16:36 +02002246 if (size == 0)
2247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002249 if (size == 1)
2250 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002252 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002253 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!res)
2255 return NULL;
2256 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002257 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002259}
2260
Victor Stinnere57b1c02011-09-28 22:20:48 +02002261static PyObject*
2262_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263{
2264 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002265 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002266
Serhiy Storchaka678db842013-01-26 12:16:36 +02002267 if (size == 0)
2268 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002269 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002270 if (size == 1)
2271 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002273 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002274 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 if (!res)
2276 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002277 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002279 else {
2280 _PyUnicode_CONVERT_BYTES(
2281 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2282 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002283 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 return res;
2285}
2286
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287static PyObject*
2288_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289{
2290 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002291 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292
Serhiy Storchaka678db842013-01-26 12:16:36 +02002293 if (size == 0)
2294 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002296 if (size == 1)
2297 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002298
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002300 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 if (!res)
2302 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002303 if (max_char < 256)
2304 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2305 PyUnicode_1BYTE_DATA(res));
2306 else if (max_char < 0x10000)
2307 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2308 PyUnicode_2BYTE_DATA(res));
2309 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002311 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 return res;
2313}
2314
2315PyObject*
2316PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2317{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002318 if (size < 0) {
2319 PyErr_SetString(PyExc_ValueError, "size must be positive");
2320 return NULL;
2321 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002322 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002324 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002329 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002330 PyErr_SetString(PyExc_SystemError, "invalid kind");
2331 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333}
2334
Victor Stinnerece58de2012-04-23 23:36:38 +02002335Py_UCS4
2336_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2337{
2338 enum PyUnicode_Kind kind;
2339 void *startptr, *endptr;
2340
2341 assert(PyUnicode_IS_READY(unicode));
2342 assert(0 <= start);
2343 assert(end <= PyUnicode_GET_LENGTH(unicode));
2344 assert(start <= end);
2345
2346 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2347 return PyUnicode_MAX_CHAR_VALUE(unicode);
2348
2349 if (start == end)
2350 return 127;
2351
Victor Stinner94d558b2012-04-27 22:26:58 +02002352 if (PyUnicode_IS_ASCII(unicode))
2353 return 127;
2354
Victor Stinnerece58de2012-04-23 23:36:38 +02002355 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002356 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002357 endptr = (char *)startptr + end * kind;
2358 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002359 switch(kind) {
2360 case PyUnicode_1BYTE_KIND:
2361 return ucs1lib_find_max_char(startptr, endptr);
2362 case PyUnicode_2BYTE_KIND:
2363 return ucs2lib_find_max_char(startptr, endptr);
2364 case PyUnicode_4BYTE_KIND:
2365 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002366 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002367 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002368 }
2369}
2370
Victor Stinner25a4b292011-10-06 12:31:55 +02002371/* Ensure that a string uses the most efficient storage, if it is not the
2372 case: create a new string with of the right kind. Write NULL into *p_unicode
2373 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002374static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002375unicode_adjust_maxchar(PyObject **p_unicode)
2376{
2377 PyObject *unicode, *copy;
2378 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002379 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002380 unsigned int kind;
2381
2382 assert(p_unicode != NULL);
2383 unicode = *p_unicode;
2384 assert(PyUnicode_IS_READY(unicode));
2385 if (PyUnicode_IS_ASCII(unicode))
2386 return;
2387
2388 len = PyUnicode_GET_LENGTH(unicode);
2389 kind = PyUnicode_KIND(unicode);
2390 if (kind == PyUnicode_1BYTE_KIND) {
2391 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002392 max_char = ucs1lib_find_max_char(u, u + len);
2393 if (max_char >= 128)
2394 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002395 }
2396 else if (kind == PyUnicode_2BYTE_KIND) {
2397 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002398 max_char = ucs2lib_find_max_char(u, u + len);
2399 if (max_char >= 256)
2400 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002401 }
2402 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002403 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002404 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 max_char = ucs4lib_find_max_char(u, u + len);
2406 if (max_char >= 0x10000)
2407 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002408 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002409 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002410 if (copy != NULL)
2411 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002412 Py_DECREF(unicode);
2413 *p_unicode = copy;
2414}
2415
Victor Stinner034f6cf2011-09-30 02:26:44 +02002416PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002417_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002418{
Victor Stinner87af4f22011-11-21 23:03:47 +01002419 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002420 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002421
Victor Stinner034f6cf2011-09-30 02:26:44 +02002422 if (!PyUnicode_Check(unicode)) {
2423 PyErr_BadInternalCall();
2424 return NULL;
2425 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002426 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002427 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002428
Victor Stinner87af4f22011-11-21 23:03:47 +01002429 length = PyUnicode_GET_LENGTH(unicode);
2430 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002431 if (!copy)
2432 return NULL;
2433 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2434
Christian Heimesf051e432016-09-13 20:22:02 +02002435 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002436 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002437 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002438 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002439}
2440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441
Victor Stinnerbc603d12011-10-02 01:00:40 +02002442/* Widen Unicode objects to larger buffers. Don't write terminating null
2443 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444
2445void*
2446_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2447{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002448 Py_ssize_t len;
2449 void *result;
2450 unsigned int skind;
2451
Benjamin Petersonbac79492012-01-14 13:34:47 -05002452 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002453 return NULL;
2454
2455 len = PyUnicode_GET_LENGTH(s);
2456 skind = PyUnicode_KIND(s);
2457 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002461 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002462 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002463 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002464 if (!result)
2465 return PyErr_NoMemory();
2466 assert(skind == PyUnicode_1BYTE_KIND);
2467 _PyUnicode_CONVERT_BYTES(
2468 Py_UCS1, Py_UCS2,
2469 PyUnicode_1BYTE_DATA(s),
2470 PyUnicode_1BYTE_DATA(s) + len,
2471 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002473 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002474 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002475 if (!result)
2476 return PyErr_NoMemory();
2477 if (skind == PyUnicode_2BYTE_KIND) {
2478 _PyUnicode_CONVERT_BYTES(
2479 Py_UCS2, Py_UCS4,
2480 PyUnicode_2BYTE_DATA(s),
2481 PyUnicode_2BYTE_DATA(s) + len,
2482 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002484 else {
2485 assert(skind == PyUnicode_1BYTE_KIND);
2486 _PyUnicode_CONVERT_BYTES(
2487 Py_UCS1, Py_UCS4,
2488 PyUnicode_1BYTE_DATA(s),
2489 PyUnicode_1BYTE_DATA(s) + len,
2490 result);
2491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002493 default:
2494 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 }
Victor Stinner01698042011-10-04 00:04:26 +02002496 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return NULL;
2498}
2499
2500static Py_UCS4*
2501as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
2504 int kind;
2505 void *data;
2506 Py_ssize_t len, targetlen;
2507 if (PyUnicode_READY(string) == -1)
2508 return NULL;
2509 kind = PyUnicode_KIND(string);
2510 data = PyUnicode_DATA(string);
2511 len = PyUnicode_GET_LENGTH(string);
2512 targetlen = len;
2513 if (copy_null)
2514 targetlen++;
2515 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002516 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 if (!target) {
2518 PyErr_NoMemory();
2519 return NULL;
2520 }
2521 }
2522 else {
2523 if (targetsize < targetlen) {
2524 PyErr_Format(PyExc_SystemError,
2525 "string is longer than the buffer");
2526 if (copy_null && 0 < targetsize)
2527 target[0] = 0;
2528 return NULL;
2529 }
2530 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002531 if (kind == PyUnicode_1BYTE_KIND) {
2532 Py_UCS1 *start = (Py_UCS1 *) data;
2533 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002535 else if (kind == PyUnicode_2BYTE_KIND) {
2536 Py_UCS2 *start = (Py_UCS2 *) data;
2537 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2538 }
2539 else {
2540 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002541 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 if (copy_null)
2544 target[len] = 0;
2545 return target;
2546}
2547
2548Py_UCS4*
2549PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2550 int copy_null)
2551{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002552 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 PyErr_BadInternalCall();
2554 return NULL;
2555 }
2556 return as_ucs4(string, target, targetsize, copy_null);
2557}
2558
2559Py_UCS4*
2560PyUnicode_AsUCS4Copy(PyObject *string)
2561{
2562 return as_ucs4(string, NULL, 0, 1);
2563}
2564
/* Maximum number of characters required for the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) decimal
   digits, plus 1 for the sign; 53/22 is used as an upper bound for
   log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG * 53 - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002569
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570static int
2571unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2572 Py_ssize_t width, Py_ssize_t precision)
2573{
2574 Py_ssize_t length, fill, arglen;
2575 Py_UCS4 maxchar;
2576
2577 if (PyUnicode_READY(str) == -1)
2578 return -1;
2579
2580 length = PyUnicode_GET_LENGTH(str);
2581 if ((precision == -1 || precision >= length)
2582 && width <= length)
2583 return _PyUnicodeWriter_WriteStr(writer, str);
2584
2585 if (precision != -1)
2586 length = Py_MIN(precision, length);
2587
2588 arglen = Py_MAX(length, width);
2589 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2590 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2591 else
2592 maxchar = writer->maxchar;
2593
2594 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2595 return -1;
2596
2597 if (width > length) {
2598 fill = width - length;
2599 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2600 return -1;
2601 writer->pos += fill;
2602 }
2603
2604 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2605 str, 0, length);
2606 writer->pos += length;
2607 return 0;
2608}
2609
2610static int
Victor Stinner998b8062018-09-12 00:23:25 +02002611unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002612 Py_ssize_t width, Py_ssize_t precision)
2613{
2614 /* UTF-8 */
2615 Py_ssize_t length;
2616 PyObject *unicode;
2617 int res;
2618
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002619 if (precision == -1) {
2620 length = strlen(str);
2621 }
2622 else {
2623 length = 0;
2624 while (length < precision && str[length]) {
2625 length++;
2626 }
2627 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2629 if (unicode == NULL)
2630 return -1;
2631
2632 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2633 Py_DECREF(unicode);
2634 return res;
2635}
2636
Victor Stinner96865452011-03-01 23:44:09 +00002637static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002638unicode_fromformat_arg(_PyUnicodeWriter *writer,
2639 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002640{
Victor Stinnere215d962012-10-06 23:03:36 +02002641 const char *p;
2642 Py_ssize_t len;
2643 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 Py_ssize_t width;
2645 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 int longflag;
2647 int longlongflag;
2648 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002649 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 p = f;
2652 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002653 zeropad = 0;
2654 if (*f == '0') {
2655 zeropad = 1;
2656 f++;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658
2659 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 width = -1;
2661 if (Py_ISDIGIT((unsigned)*f)) {
2662 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002663 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002664 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002665 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002666 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002668 return NULL;
2669 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002671 f++;
2672 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002673 }
2674 precision = -1;
2675 if (*f == '.') {
2676 f++;
2677 if (Py_ISDIGIT((unsigned)*f)) {
2678 precision = (*f - '0');
2679 f++;
2680 while (Py_ISDIGIT((unsigned)*f)) {
2681 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2682 PyErr_SetString(PyExc_ValueError,
2683 "precision too big");
2684 return NULL;
2685 }
2686 precision = (precision * 10) + (*f - '0');
2687 f++;
2688 }
2689 }
Victor Stinner96865452011-03-01 23:44:09 +00002690 if (*f == '%') {
2691 /* "%.3%s" => f points to "3" */
2692 f--;
2693 }
2694 }
2695 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002697 f--;
2698 }
Victor Stinner96865452011-03-01 23:44:09 +00002699
2700 /* Handle %ld, %lu, %lld and %llu. */
2701 longflag = 0;
2702 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002703 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002704 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002705 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002706 longflag = 1;
2707 ++f;
2708 }
Victor Stinner96865452011-03-01 23:44:09 +00002709 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002710 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002711 longlongflag = 1;
2712 f += 2;
2713 }
Victor Stinner96865452011-03-01 23:44:09 +00002714 }
2715 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002716 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002717 size_tflag = 1;
2718 ++f;
2719 }
Victor Stinnere215d962012-10-06 23:03:36 +02002720
2721 if (f[1] == '\0')
2722 writer->overallocate = 0;
2723
2724 switch (*f) {
2725 case 'c':
2726 {
2727 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002728 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002729 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002730 "character argument not in range(0x110000)");
2731 return NULL;
2732 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002733 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'i':
2739 case 'd':
2740 case 'u':
2741 case 'x':
2742 {
2743 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002744 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002746
2747 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002748 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002749 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002750 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002751 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002752 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002753 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002754 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002755 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002756 va_arg(*vargs, size_t));
2757 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002758 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002759 va_arg(*vargs, unsigned int));
2760 }
2761 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002762 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002763 }
2764 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002765 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002766 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002767 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002768 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002769 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002770 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002771 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002772 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002773 va_arg(*vargs, Py_ssize_t));
2774 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002775 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002776 va_arg(*vargs, int));
2777 }
2778 assert(len >= 0);
2779
Victor Stinnere215d962012-10-06 23:03:36 +02002780 if (precision < len)
2781 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002782
2783 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2785 return NULL;
2786
Victor Stinnere215d962012-10-06 23:03:36 +02002787 if (width > precision) {
2788 Py_UCS4 fillchar;
2789 fill = width - precision;
2790 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002791 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2792 return NULL;
2793 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 }
Victor Stinner15a11362012-10-06 23:48:20 +02002795 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2798 return NULL;
2799 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002800 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801
Victor Stinner4a587072013-11-19 12:54:53 +01002802 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2803 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002804 break;
2805 }
2806
2807 case 'p':
2808 {
2809 char number[MAX_LONG_LONG_CHARS];
2810
2811 len = sprintf(number, "%p", va_arg(*vargs, void*));
2812 assert(len >= 0);
2813
2814 /* %p is ill-defined: ensure leading 0x. */
2815 if (number[1] == 'X')
2816 number[1] = 'x';
2817 else if (number[1] != 'x') {
2818 memmove(number + 2, number,
2819 strlen(number) + 1);
2820 number[0] = '0';
2821 number[1] = 'x';
2822 len += 2;
2823 }
2824
Victor Stinner4a587072013-11-19 12:54:53 +01002825 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002826 return NULL;
2827 break;
2828 }
2829
2830 case 's':
2831 {
2832 /* UTF-8 */
2833 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002834 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002835 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002836 break;
2837 }
2838
2839 case 'U':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 assert(obj && _PyUnicode_CHECK(obj));
2843
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002845 return NULL;
2846 break;
2847 }
2848
2849 case 'V':
2850 {
2851 PyObject *obj = va_arg(*vargs, PyObject *);
2852 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002853 if (obj) {
2854 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002855 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 }
2858 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002860 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002862 }
2863 break;
2864 }
2865
2866 case 'S':
2867 {
2868 PyObject *obj = va_arg(*vargs, PyObject *);
2869 PyObject *str;
2870 assert(obj);
2871 str = PyObject_Str(obj);
2872 if (!str)
2873 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002874 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002875 Py_DECREF(str);
2876 return NULL;
2877 }
2878 Py_DECREF(str);
2879 break;
2880 }
2881
2882 case 'R':
2883 {
2884 PyObject *obj = va_arg(*vargs, PyObject *);
2885 PyObject *repr;
2886 assert(obj);
2887 repr = PyObject_Repr(obj);
2888 if (!repr)
2889 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002890 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 Py_DECREF(repr);
2892 return NULL;
2893 }
2894 Py_DECREF(repr);
2895 break;
2896 }
2897
2898 case 'A':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 PyObject *ascii;
2902 assert(obj);
2903 ascii = PyObject_ASCII(obj);
2904 if (!ascii)
2905 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002906 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002907 Py_DECREF(ascii);
2908 return NULL;
2909 }
2910 Py_DECREF(ascii);
2911 break;
2912 }
2913
2914 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002915 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 break;
2918
2919 default:
2920 /* if we stumble upon an unknown formatting code, copy the rest
2921 of the format string to the output string. (we cannot just
2922 skip the code, since there's no way to know what's in the
2923 argument list) */
2924 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002925 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 return NULL;
2927 f = p+len;
2928 return f;
2929 }
2930
2931 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002932 return f;
2933}
2934
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935PyObject *
2936PyUnicode_FromFormatV(const char *format, va_list vargs)
2937{
Victor Stinnere215d962012-10-06 23:03:36 +02002938 va_list vargs2;
2939 const char *f;
2940 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941
Victor Stinner8f674cc2013-04-17 23:02:17 +02002942 _PyUnicodeWriter_Init(&writer);
2943 writer.min_length = strlen(format) + 100;
2944 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002945
Benjamin Peterson0c212142016-09-20 20:39:33 -07002946 // Copy varags to be able to pass a reference to a subfunction.
2947 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002948
2949 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002950 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 f = unicode_fromformat_arg(&writer, f, &vargs2);
2952 if (f == NULL)
2953 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002956 const char *p;
2957 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002958
Victor Stinnere215d962012-10-06 23:03:36 +02002959 p = f;
2960 do
2961 {
2962 if ((unsigned char)*p > 127) {
2963 PyErr_Format(PyExc_ValueError,
2964 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2965 "string, got a non-ASCII byte: 0x%02x",
2966 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002967 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
2969 p++;
2970 }
2971 while (*p != '\0' && *p != '%');
2972 len = p - f;
2973
2974 if (*p == '\0')
2975 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002976
2977 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002978 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002979
2980 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002982 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002983 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002984 return _PyUnicodeWriter_Finish(&writer);
2985
2986 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002987 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002988 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002989 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002990}
2991
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992PyObject *
2993PyUnicode_FromFormat(const char *format, ...)
2994{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002995 PyObject* ret;
2996 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002997
2998#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002999 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 ret = PyUnicode_FromFormatV(format, vargs);
3004 va_end(vargs);
3005 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003006}
3007
Serhiy Storchakac46db922018-10-23 22:58:24 +03003008static Py_ssize_t
3009unicode_get_widechar_size(PyObject *unicode)
3010{
3011 Py_ssize_t res;
3012
3013 assert(unicode != NULL);
3014 assert(_PyUnicode_CHECK(unicode));
3015
3016 if (_PyUnicode_WSTR(unicode) != NULL) {
3017 return PyUnicode_WSTR_LENGTH(unicode);
3018 }
3019 assert(PyUnicode_IS_READY(unicode));
3020
3021 res = _PyUnicode_LENGTH(unicode);
3022#if SIZEOF_WCHAR_T == 2
3023 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3024 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3025 const Py_UCS4 *end = s + res;
3026 for (; s < end; ++s) {
3027 if (*s > 0xFFFF) {
3028 ++res;
3029 }
3030 }
3031 }
3032#endif
3033 return res;
3034}
3035
3036static void
3037unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3038{
3039 const wchar_t *wstr;
3040
3041 assert(unicode != NULL);
3042 assert(_PyUnicode_CHECK(unicode));
3043
3044 wstr = _PyUnicode_WSTR(unicode);
3045 if (wstr != NULL) {
3046 memcpy(w, wstr, size * sizeof(wchar_t));
3047 return;
3048 }
3049 assert(PyUnicode_IS_READY(unicode));
3050
3051 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3052 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3053 for (; size--; ++s, ++w) {
3054 *w = *s;
3055 }
3056 }
3057 else {
3058#if SIZEOF_WCHAR_T == 4
3059 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3060 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3061 for (; size--; ++s, ++w) {
3062 *w = *s;
3063 }
3064#else
3065 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3066 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3067 for (; size--; ++s, ++w) {
3068 Py_UCS4 ch = *s;
3069 if (ch > 0xFFFF) {
3070 assert(ch <= MAX_UNICODE);
3071 /* encode surrogate pair in this case */
3072 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3073 if (!size--)
3074 break;
3075 *w = Py_UNICODE_LOW_SURROGATE(ch);
3076 }
3077 else {
3078 *w = ch;
3079 }
3080 }
3081#endif
3082 }
3083}
3084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003085#ifdef HAVE_WCHAR_H
3086
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003087/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003088
Victor Stinnerd88d9832011-09-06 02:00:05 +02003089 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003090 character) required to convert the unicode object. Ignore size argument.
3091
Victor Stinnerd88d9832011-09-06 02:00:05 +02003092 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003093 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003094 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003095Py_ssize_t
3096PyUnicode_AsWideChar(PyObject *unicode,
3097 wchar_t *w,
3098 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003099{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003100 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003102 if (unicode == NULL) {
3103 PyErr_BadInternalCall();
3104 return -1;
3105 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110
3111 res = unicode_get_widechar_size(unicode);
3112 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003114 }
3115
3116 if (size > res) {
3117 size = res + 1;
3118 }
3119 else {
3120 res = size;
3121 }
3122 unicode_copy_as_widechar(unicode, w, size);
3123 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003124}
3125
Victor Stinner137c34c2010-09-29 10:25:54 +00003126wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003127PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003128 Py_ssize_t *size)
3129{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003130 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003131 Py_ssize_t buflen;
3132
3133 if (unicode == NULL) {
3134 PyErr_BadInternalCall();
3135 return NULL;
3136 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139 return NULL;
3140 }
3141
Serhiy Storchakac46db922018-10-23 22:58:24 +03003142 buflen = unicode_get_widechar_size(unicode);
3143 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003144 if (buffer == NULL) {
3145 PyErr_NoMemory();
3146 return NULL;
3147 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003148 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3149 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003151 }
3152 else if (wcslen(buffer) != (size_t)buflen) {
3153 PyMem_FREE(buffer);
3154 PyErr_SetString(PyExc_ValueError,
3155 "embedded null character");
3156 return NULL;
3157 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003158 return buffer;
3159}
3160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003165{
Victor Stinner8faf8212011-12-08 22:14:11 +01003166 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 PyErr_SetString(PyExc_ValueError,
3168 "chr() arg not in range(0x110000)");
3169 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003170 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003171
Victor Stinner985a82a2014-01-03 12:53:47 +01003172 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003173}
3174
Alexander Belopolsky40018472011-02-26 01:02:56 +00003175PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003176PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003178 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003180 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003181 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003182 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 Py_INCREF(obj);
3184 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003185 }
3186 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 /* For a Unicode subtype that's not a Unicode object,
3188 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003189 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003190 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003191 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003192 "Can't convert '%.100s' object to str implicitly",
3193 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003194 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003195}
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003198PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003199 const char *encoding,
3200 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003201{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 PyErr_BadInternalCall();
3207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003209
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003210 /* Decoding bytes objects is the most common case and should be fast */
3211 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003212 if (PyBytes_GET_SIZE(obj) == 0)
3213 _Py_RETURN_UNICODE_EMPTY();
3214 v = PyUnicode_Decode(
3215 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3216 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003217 return v;
3218 }
3219
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003220 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 PyErr_SetString(PyExc_TypeError,
3222 "decoding str is not supported");
3223 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003224 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003225
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003226 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3227 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3228 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003229 "decoding to str: need a bytes-like object, %.80s found",
3230 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003231 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003232 }
Tim Petersced69f82003-09-16 20:30:58 +00003233
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003234 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003235 PyBuffer_Release(&buffer);
3236 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003238
Serhiy Storchaka05997252013-01-26 12:14:02 +02003239 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003240 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003241 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242}
3243
/* Normalize an encoding name into the buffer lower (lower_len bytes).

   Alphanumeric characters and '.' are copied in lowercase.  Every run of
   other characters between two kept characters is replaced by a single '_';
   such runs at the very start or the very end of the name are dropped.
   Similar to encodings.normalize_encoding(), but also converts to lowercase.

   Return 1 on success (lower is null-terminated), or 0 if the normalized
   name does not fit in lower_len-1 characters. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *in;
    char *out;
    char *out_limit;
    int pending_sep;

    assert(encoding != NULL);

    out = lower;
    out_limit = &lower[lower_len - 1];  /* slot reserved for the '\0' */
    pending_sep = 0;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            /* Punctuation: remember it, emit nothing yet. */
            pending_sep = 1;
            continue;
        }

        /* A separator is only written between two kept characters. */
        if (pending_sep && out != lower) {
            if (out == out_limit) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_limit) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3292
Alexander Belopolsky40018472011-02-26 01:02:56 +00003293PyObject *
3294PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003295 Py_ssize_t size,
3296 const char *encoding,
3297 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003298{
3299 PyObject *buffer = NULL, *unicode;
3300 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003301 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3302
3303 if (encoding == NULL) {
3304 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3305 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003306
Fred Drakee4315f52000-05-09 19:53:39 +00003307 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003308 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3309 char *lower = buflower;
3310
3311 /* Fast paths */
3312 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3313 lower += 3;
3314 if (*lower == '_') {
3315 /* Match "utf8" and "utf_8" */
3316 lower++;
3317 }
3318
3319 if (lower[0] == '8' && lower[1] == 0) {
3320 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3321 }
3322 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3323 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3324 }
3325 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3326 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3327 }
3328 }
3329 else {
3330 if (strcmp(lower, "ascii") == 0
3331 || strcmp(lower, "us_ascii") == 0) {
3332 return PyUnicode_DecodeASCII(s, size, errors);
3333 }
Steve Dowercc16be82016-09-08 10:35:16 -07003334 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003335 else if (strcmp(lower, "mbcs") == 0) {
3336 return PyUnicode_DecodeMBCS(s, size, errors);
3337 }
3338 #endif
3339 else if (strcmp(lower, "latin1") == 0
3340 || strcmp(lower, "latin_1") == 0
3341 || strcmp(lower, "iso_8859_1") == 0
3342 || strcmp(lower, "iso8859_1") == 0) {
3343 return PyUnicode_DecodeLatin1(s, size, errors);
3344 }
3345 }
Victor Stinner37296e82010-06-10 13:36:23 +00003346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
3348 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003349 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003350 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003351 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003352 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (buffer == NULL)
3354 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003355 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 if (unicode == NULL)
3357 goto onError;
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003360 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003361 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003362 encoding,
3363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 Py_DECREF(unicode);
3365 goto onError;
3366 }
3367 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003368 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 Py_XDECREF(buffer);
3372 return NULL;
3373}
3374
Alexander Belopolsky40018472011-02-26 01:02:56 +00003375PyObject *
3376PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003377 const char *encoding,
3378 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003379{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003382 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003383 }
3384
Serhiy Storchaka00939072016-10-27 21:05:49 +03003385 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3386 "PyUnicode_AsDecodedObject() is deprecated; "
3387 "use PyCodec_Decode() to decode from str", 1) < 0)
3388 return NULL;
3389
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003390 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392
3393 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003394 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
Serhiy Storchaka00939072016-10-27 21:05:49 +03003409 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3410 "PyUnicode_AsDecodedUnicode() is deprecated; "
3411 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3412 return NULL;
3413
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003414 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416
3417 /* Decode via the codec registry */
3418 v = PyCodec_Decode(unicode, encoding, errors);
3419 if (v == NULL)
3420 goto onError;
3421 if (!PyUnicode_Check(v)) {
3422 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003423 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003424 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003425 encoding,
3426 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 Py_DECREF(v);
3428 goto onError;
3429 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003430 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433 return NULL;
3434}
3435
Alexander Belopolsky40018472011-02-26 01:02:56 +00003436PyObject *
3437PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003438 Py_ssize_t size,
3439 const char *encoding,
3440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441{
3442 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003444 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3448 Py_DECREF(unicode);
3449 return v;
3450}
3451
Alexander Belopolsky40018472011-02-26 01:02:56 +00003452PyObject *
3453PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003454 const char *encoding,
3455 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456{
3457 PyObject *v;
3458
3459 if (!PyUnicode_Check(unicode)) {
3460 PyErr_BadArgument();
3461 goto onError;
3462 }
3463
Serhiy Storchaka00939072016-10-27 21:05:49 +03003464 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3465 "PyUnicode_AsEncodedObject() is deprecated; "
3466 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3467 "or PyCodec_Encode() for generic encoding", 1) < 0)
3468 return NULL;
3469
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003470 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003472
3473 /* Encode via the codec registry */
3474 v = PyCodec_Encode(unicode, encoding, errors);
3475 if (v == NULL)
3476 goto onError;
3477 return v;
3478
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003480 return NULL;
3481}
3482
Victor Stinner1b579672011-12-17 05:47:23 +01003483
Victor Stinner2cba6b82018-01-10 22:46:15 +01003484static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003485unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003486 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003487{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 Py_ssize_t wlen;
3489 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3490 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003492 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003494 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003495 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003496 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003497 return NULL;
3498 }
3499
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003500 char *str;
3501 size_t error_pos;
3502 const char *reason;
3503 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003504 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003505 PyMem_Free(wstr);
3506
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003507 if (res != 0) {
3508 if (res == -2) {
3509 PyObject *exc;
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 if (exc != NULL) {
3516 PyCodec_StrictErrors(exc);
3517 Py_DECREF(exc);
3518 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003519 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003520 else if (res == -3) {
3521 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3522 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003524 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003526 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003527 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003528
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003529 PyObject *bytes = PyBytes_FromString(str);
3530 PyMem_RawFree(str);
3531 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003532}
3533
Victor Stinnerad158722010-10-27 00:25:46 +00003534PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003535PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3536{
Victor Stinner709d23d2019-05-02 14:56:30 -04003537 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3538 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003539}
3540
3541PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003542PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003543{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003544 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003545#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003546 if (interp->fs_codec.encoding) {
3547 return unicode_encode_utf8(unicode,
3548 interp->fs_codec.error_handler,
3549 interp->fs_codec.errors);
3550 }
3551 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003552 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003553 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003554 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003555 assert(errors != _Py_ERROR_UNKNOWN);
3556 return unicode_encode_utf8(unicode, errors, NULL);
3557 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003558#else
Victor Stinner793b5312011-04-27 00:24:21 +02003559 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3560 cannot use it to encode and decode filenames before it is loaded. Load
3561 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003562 implementation of the locale codec until the codec registry is
3563 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003564 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003565 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003566 interp->fs_codec.encoding,
3567 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003568 }
3569 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003570 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003571 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003572 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003573 assert(errors != _Py_ERROR_UNKNOWN);
3574 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003575 }
Victor Stinnerad158722010-10-27 00:25:46 +00003576#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003577}
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding,
3582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583{
3584 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003585 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (!PyUnicode_Check(unicode)) {
3588 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Fred Drakee4315f52000-05-09 19:53:39 +00003591
Victor Stinner942889a2016-09-05 15:40:10 -07003592 if (encoding == NULL) {
3593 return _PyUnicode_AsUTF8String(unicode, errors);
3594 }
3595
Fred Drakee4315f52000-05-09 19:53:39 +00003596 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003597 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3598 char *lower = buflower;
3599
3600 /* Fast paths */
3601 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3602 lower += 3;
3603 if (*lower == '_') {
3604 /* Match "utf8" and "utf_8" */
3605 lower++;
3606 }
3607
3608 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003610 }
3611 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3612 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3613 }
3614 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3615 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3616 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003617 }
Victor Stinner942889a2016-09-05 15:40:10 -07003618 else {
3619 if (strcmp(lower, "ascii") == 0
3620 || strcmp(lower, "us_ascii") == 0) {
3621 return _PyUnicode_AsASCIIString(unicode, errors);
3622 }
Steve Dowercc16be82016-09-08 10:35:16 -07003623#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003624 else if (strcmp(lower, "mbcs") == 0) {
3625 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3626 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003627#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003628 else if (strcmp(lower, "latin1") == 0 ||
3629 strcmp(lower, "latin_1") == 0 ||
3630 strcmp(lower, "iso_8859_1") == 0 ||
3631 strcmp(lower, "iso8859_1") == 0) {
3632 return _PyUnicode_AsLatin1String(unicode, errors);
3633 }
3634 }
Victor Stinner37296e82010-06-10 13:36:23 +00003635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003638 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 return NULL;
3641
3642 /* The normal path */
3643 if (PyBytes_Check(v))
3644 return v;
3645
3646 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003648 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003649 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650
3651 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "encoder %s returned bytearray instead of bytes; "
3653 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003654 encoding);
3655 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 Py_DECREF(v);
3657 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003660 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3661 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003662 Py_DECREF(v);
3663 return b;
3664 }
3665
3666 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003667 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003669 encoding,
3670 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003671 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 return NULL;
3673}
3674
Alexander Belopolsky40018472011-02-26 01:02:56 +00003675PyObject *
3676PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003677 const char *encoding,
3678 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003679{
3680 PyObject *v;
3681
3682 if (!PyUnicode_Check(unicode)) {
3683 PyErr_BadArgument();
3684 goto onError;
3685 }
3686
Serhiy Storchaka00939072016-10-27 21:05:49 +03003687 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3688 "PyUnicode_AsEncodedUnicode() is deprecated; "
3689 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3690 return NULL;
3691
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003692 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003694
3695 /* Encode via the codec registry */
3696 v = PyCodec_Encode(unicode, encoding, errors);
3697 if (v == NULL)
3698 goto onError;
3699 if (!PyUnicode_Check(v)) {
3700 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003701 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003702 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003703 encoding,
3704 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003705 Py_DECREF(v);
3706 goto onError;
3707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712}
3713
Victor Stinner2cba6b82018-01-10 22:46:15 +01003714static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003715unicode_decode_locale(const char *str, Py_ssize_t len,
3716 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003718 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3719 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720 return NULL;
3721 }
3722
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003723 wchar_t *wstr;
3724 size_t wlen;
3725 const char *reason;
3726 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003727 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003728 if (res != 0) {
3729 if (res == -2) {
3730 PyObject *exc;
3731 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3732 "locale", str, len,
3733 (Py_ssize_t)wlen,
3734 (Py_ssize_t)(wlen + 1),
3735 reason);
3736 if (exc != NULL) {
3737 PyCodec_StrictErrors(exc);
3738 Py_DECREF(exc);
3739 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003740 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003741 else if (res == -3) {
3742 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3743 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003744 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003745 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 }
Victor Stinner2f197072011-12-17 07:08:30 +01003747 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003748 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749
3750 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3751 PyMem_RawFree(wstr);
3752 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003753}
3754
3755PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003756PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3757 const char *errors)
3758{
Victor Stinner709d23d2019-05-02 14:56:30 -04003759 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3760 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003761}
3762
3763PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003764PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765{
3766 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003767 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3768 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003769}
3770
3771
3772PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003773PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003774 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003775 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3776}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003777
Christian Heimes5894ba72007-11-04 11:43:14 +00003778PyObject*
3779PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3780{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003781 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003782#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003783 if (interp->fs_codec.encoding) {
3784 return unicode_decode_utf8(s, size,
3785 interp->fs_codec.error_handler,
3786 interp->fs_codec.errors,
3787 NULL);
3788 }
3789 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003790 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003791 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003792 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 assert(errors != _Py_ERROR_UNKNOWN);
3794 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3795 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003796#else
Victor Stinner793b5312011-04-27 00:24:21 +02003797 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3798 cannot use it to encode and decode filenames before it is loaded. Load
3799 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003800 implementation of the locale codec until the codec registry is
3801 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003802 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003803 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 interp->fs_codec.encoding,
3805 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003806 }
3807 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003808 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003809 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003810 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003811 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812 }
Victor Stinnerad158722010-10-27 00:25:46 +00003813#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814}
3815
Martin v. Löwis011e8422009-05-05 04:43:17 +00003816
3817int
3818PyUnicode_FSConverter(PyObject* arg, void* addr)
3819{
Brett Cannonec6ce872016-09-06 15:50:29 -07003820 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003821 PyObject *output = NULL;
3822 Py_ssize_t size;
3823 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003824 if (arg == NULL) {
3825 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003826 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003827 return 1;
3828 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003829 path = PyOS_FSPath(arg);
3830 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003831 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003832 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003833 if (PyBytes_Check(path)) {
3834 output = path;
3835 }
3836 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3837 output = PyUnicode_EncodeFSDefault(path);
3838 Py_DECREF(path);
3839 if (!output) {
3840 return 0;
3841 }
3842 assert(PyBytes_Check(output));
3843 }
3844
Victor Stinner0ea2a462010-04-30 00:22:08 +00003845 size = PyBytes_GET_SIZE(output);
3846 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003847 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003848 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003849 Py_DECREF(output);
3850 return 0;
3851 }
3852 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003853 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854}
3855
3856
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857int
3858PyUnicode_FSDecoder(PyObject* arg, void* addr)
3859{
Brett Cannona5711202016-09-06 19:36:01 -07003860 int is_buffer = 0;
3861 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003862 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003863 if (arg == NULL) {
3864 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003865 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003866 return 1;
3867 }
Brett Cannona5711202016-09-06 19:36:01 -07003868
3869 is_buffer = PyObject_CheckBuffer(arg);
3870 if (!is_buffer) {
3871 path = PyOS_FSPath(arg);
3872 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003873 return 0;
3874 }
Brett Cannona5711202016-09-06 19:36:01 -07003875 }
3876 else {
3877 path = arg;
3878 Py_INCREF(arg);
3879 }
3880
3881 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003882 output = path;
3883 }
3884 else if (PyBytes_Check(path) || is_buffer) {
3885 PyObject *path_bytes = NULL;
3886
3887 if (!PyBytes_Check(path) &&
3888 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003889 "path should be string, bytes, or os.PathLike, not %.200s",
3890 Py_TYPE(arg)->tp_name)) {
3891 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003892 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003893 }
3894 path_bytes = PyBytes_FromObject(path);
3895 Py_DECREF(path);
3896 if (!path_bytes) {
3897 return 0;
3898 }
3899 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3900 PyBytes_GET_SIZE(path_bytes));
3901 Py_DECREF(path_bytes);
3902 if (!output) {
3903 return 0;
3904 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003905 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003906 else {
3907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003908 "path should be string, bytes, or os.PathLike, not %.200s",
3909 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003910 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003911 return 0;
3912 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003913 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003914 Py_DECREF(output);
3915 return 0;
3916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003918 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920 Py_DECREF(output);
3921 return 0;
3922 }
3923 *(PyObject**)addr = output;
3924 return Py_CLEANUP_SUPPORTED;
3925}
3926
3927
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003928const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003930{
Christian Heimesf3863112007-11-22 07:46:41 +00003931 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003933 if (!PyUnicode_Check(unicode)) {
3934 PyErr_BadArgument();
3935 return NULL;
3936 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003937 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003940 if (PyUnicode_UTF8(unicode) == NULL) {
3941 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003942 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 if (bytes == NULL)
3944 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3946 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003947 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 Py_DECREF(bytes);
3949 return NULL;
3950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003952 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 PyBytes_AS_STRING(bytes),
3954 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 Py_DECREF(bytes);
3956 }
3957
3958 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003959 *psize = PyUnicode_UTF8_LENGTH(unicode);
3960 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003961}
3962
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003963const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3967}
3968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969Py_UNICODE *
3970PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 if (!PyUnicode_Check(unicode)) {
3973 PyErr_BadArgument();
3974 return NULL;
3975 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003976 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3977 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003979 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981
Serhiy Storchakac46db922018-10-23 22:58:24 +03003982 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3983 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3984 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003987 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3988 if (w == NULL) {
3989 PyErr_NoMemory();
3990 return NULL;
3991 }
3992 unicode_copy_as_widechar(unicode, w, wlen + 1);
3993 _PyUnicode_WSTR(unicode) = w;
3994 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3995 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 }
3997 }
3998 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004000 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001}
4002
Alexander Belopolsky40018472011-02-26 01:02:56 +00004003Py_UNICODE *
4004PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007}
4008
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004009const Py_UNICODE *
4010_PyUnicode_AsUnicode(PyObject *unicode)
4011{
4012 Py_ssize_t size;
4013 const Py_UNICODE *wstr;
4014
4015 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4016 if (wstr && wcslen(wstr) != (size_t)size) {
4017 PyErr_SetString(PyExc_ValueError, "embedded null character");
4018 return NULL;
4019 }
4020 return wstr;
4021}
4022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023
Alexander Belopolsky40018472011-02-26 01:02:56 +00004024Py_ssize_t
4025PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026{
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 goto onError;
4030 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
4032 if (PyUnicode_AsUnicode(unicode) == NULL)
4033 goto onError;
4034 }
4035 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return -1;
4039}
4040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041Py_ssize_t
4042PyUnicode_GetLength(PyObject *unicode)
4043{
Victor Stinner07621332012-06-16 04:53:46 +02004044 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 PyErr_BadArgument();
4046 return -1;
4047 }
Victor Stinner07621332012-06-16 04:53:46 +02004048 if (PyUnicode_READY(unicode) == -1)
4049 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 return PyUnicode_GET_LENGTH(unicode);
4051}
4052
4053Py_UCS4
4054PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4055{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004056 void *data;
4057 int kind;
4058
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004059 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004060 PyErr_BadArgument();
4061 return (Py_UCS4)-1;
4062 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004063 if (PyUnicode_READY(unicode) == -1) {
4064 return (Py_UCS4)-1;
4065 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004066 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004067 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 return (Py_UCS4)-1;
4069 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004070 data = PyUnicode_DATA(unicode);
4071 kind = PyUnicode_KIND(unicode);
4072 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073}
4074
4075int
4076PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4077{
4078 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004079 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 return -1;
4081 }
Victor Stinner488fa492011-12-12 00:01:39 +01004082 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004083 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004084 PyErr_SetString(PyExc_IndexError, "string index out of range");
4085 return -1;
4086 }
Victor Stinner488fa492011-12-12 00:01:39 +01004087 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004088 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004089 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4090 PyErr_SetString(PyExc_ValueError, "character out of range");
4091 return -1;
4092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4094 index, ch);
4095 return 0;
4096}
4097
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4103
Victor Stinner554f3f02010-06-16 23:33:54 +00004104/* create or adjust a UnicodeDecodeError */
4105static void
4106make_decode_exception(PyObject **exceptionObject,
4107 const char *encoding,
4108 const char *input, Py_ssize_t length,
4109 Py_ssize_t startpos, Py_ssize_t endpos,
4110 const char *reason)
4111{
4112 if (*exceptionObject == NULL) {
4113 *exceptionObject = PyUnicodeDecodeError_Create(
4114 encoding, input, length, startpos, endpos, reason);
4115 }
4116 else {
4117 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4118 goto onError;
4119 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4120 goto onError;
4121 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4122 goto onError;
4123 }
4124 return;
4125
4126onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004127 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004128}
4129
Steve Dowercc16be82016-09-08 10:35:16 -07004130#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004131static int
4132widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4133{
4134 if (newsize > *size) {
4135 wchar_t *newbuf = *buf;
4136 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4137 PyErr_NoMemory();
4138 return -1;
4139 }
4140 *buf = newbuf;
4141 }
4142 *size = newsize;
4143 return 0;
4144}
4145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146/* error handling callback helper:
4147 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004148 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 and adjust various state variables.
4150 return 0 on success, -1 on error
4151*/
4152
Alexander Belopolsky40018472011-02-26 01:02:56 +00004153static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154unicode_decode_call_errorhandler_wchar(
4155 const char *errors, PyObject **errorHandler,
4156 const char *encoding, const char *reason,
4157 const char **input, const char **inend, Py_ssize_t *startinpos,
4158 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004159 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004161 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162
4163 PyObject *restuple = NULL;
4164 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004165 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004166 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t requiredsize;
4168 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 wchar_t *repwstr;
4171 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172
4173 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 *errorHandler = PyCodec_LookupError(errors);
4175 if (*errorHandler == NULL)
4176 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 }
4178
Victor Stinner554f3f02010-06-16 23:33:54 +00004179 make_decode_exception(exceptionObject,
4180 encoding,
4181 *input, *inend - *input,
4182 *startinpos, *endinpos,
4183 reason);
4184 if (*exceptionObject == NULL)
4185 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004187 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004191 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004194 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004196
4197 /* Copy back the bytes variables, which might have been modified by the
4198 callback */
4199 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4200 if (!inputobj)
4201 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202 *input = PyBytes_AS_STRING(inputobj);
4203 insize = PyBytes_GET_SIZE(inputobj);
4204 *inend = *input + insize;
4205 /* we can DECREF safely, as the exception has another reference,
4206 so the object won't go away. */
4207 Py_DECREF(inputobj);
4208
4209 if (newpos<0)
4210 newpos = insize+newpos;
4211 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004212 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213 goto onError;
4214 }
4215
4216 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4217 if (repwstr == NULL)
4218 goto onError;
4219 /* need more space? (at least enough for what we
4220 have+the replacement+the rest of the string (starting
4221 at the new input position), so we won't have to check space
4222 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004223 requiredsize = *outpos;
4224 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4225 goto overflow;
4226 requiredsize += repwlen;
4227 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4228 goto overflow;
4229 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004230 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004231 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004232 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004234 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004236 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004238 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240 *endinpos = newpos;
4241 *inptr = *input + newpos;
4242
4243 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004244 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245 return 0;
4246
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004247 overflow:
4248 PyErr_SetString(PyExc_OverflowError,
4249 "decoded result is too long for a Python string");
4250
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 onError:
4252 Py_XDECREF(restuple);
4253 return -1;
4254}
Steve Dowercc16be82016-09-08 10:35:16 -07004255#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004256
4257static int
4258unicode_decode_call_errorhandler_writer(
4259 const char *errors, PyObject **errorHandler,
4260 const char *encoding, const char *reason,
4261 const char **input, const char **inend, Py_ssize_t *startinpos,
4262 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4263 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4264{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004265 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266
4267 PyObject *restuple = NULL;
4268 PyObject *repunicode = NULL;
4269 Py_ssize_t insize;
4270 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004271 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004272 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004274 int need_to_grow = 0;
4275 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004276
4277 if (*errorHandler == NULL) {
4278 *errorHandler = PyCodec_LookupError(errors);
4279 if (*errorHandler == NULL)
4280 goto onError;
4281 }
4282
4283 make_decode_exception(exceptionObject,
4284 encoding,
4285 *input, *inend - *input,
4286 *startinpos, *endinpos,
4287 reason);
4288 if (*exceptionObject == NULL)
4289 goto onError;
4290
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004291 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 if (restuple == NULL)
4293 goto onError;
4294 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 goto onError;
4297 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004298 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004300
4301 /* Copy back the bytes variables, which might have been modified by the
4302 callback */
4303 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4304 if (!inputobj)
4305 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004306 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004307 *input = PyBytes_AS_STRING(inputobj);
4308 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004309 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004310 /* we can DECREF safely, as the exception has another reference,
4311 so the object won't go away. */
4312 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004317 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320
Victor Stinner170ca6f2013-04-18 00:25:28 +02004321 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004322 if (replen > 1) {
4323 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004324 need_to_grow = 1;
4325 }
4326 new_inptr = *input + newpos;
4327 if (*inend - new_inptr > remain) {
4328 /* We don't know the decoding algorithm here so we make the worst
4329 assumption that one byte decodes to one unicode character.
4330 If unfortunately one byte could decode to more unicode characters,
4331 the decoder may write out-of-bound then. Is it possible for the
4332 algorithms using this function? */
4333 writer->min_length += *inend - new_inptr - remain;
4334 need_to_grow = 1;
4335 }
4336 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004337 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004338 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004339 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4340 goto onError;
4341 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004343 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004346 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004349 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355}
4356
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004357/* --- UTF-7 Codec -------------------------------------------------------- */
4358
Antoine Pitrou244651a2009-05-04 18:56:13 +00004359/* See RFC2152 for details. We encode conservatively and decode liberally. */
4360
/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  All of them may evaluate their argument more than once. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c) \
    ((c) == '+' || (c) == '/' || \
     ((c) >= '0' && (c) <= '9') || \
     ((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z'))

/* The 6-bit value of base-64 character c: A-Z -> 0..25, a-z -> 26..51,
   0-9 -> 52..61, '+' -> 62 and everything else (i.e. '/') -> 63. */

#define FROM_BASE64(c) \
    (((c) == '+') ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* The base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, met outside a shift sequence, decodes
 * to itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4391
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only, hence
 * declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4425
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c         character to classify; only 1..127 can ever be direct
 *   directO   non-zero if "Set O" characters (category 1) go out directly
 *   directWS  non-zero if whitespace (category 2) goes out directly
 *
 * "Set D" characters (category 0) are always direct.  The flag arguments
 * are parenthesized so that compound expressions such as `a || b` bind
 * correctly against the `&&` in the expansion.  c is evaluated several
 * times, so it must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437
Alexander Belopolsky40018472011-02-26 01:02:56 +00004438PyObject *
4439PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004440 Py_ssize_t size,
4441 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4444}
4445
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446/* The decoder. The only state we preserve is our read position,
4447 * i.e. how many characters we have consumed. So if we end in the
4448 * middle of a shift sequence we have to back off the read position
4449 * and the output to the beginning of the sequence, otherwise we lose
4450 * all the shift state (seen bits, number of bits seen, high
4451 * surrogate). */
4452
Alexander Belopolsky40018472011-02-26 01:02:56 +00004453PyObject *
4454PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004455 Py_ssize_t size,
4456 const char *errors,
4457 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004460 Py_ssize_t startinpos;
4461 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 const char *errmsg = "";
4465 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 unsigned int base64bits = 0;
4468 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004469 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 PyObject *errorHandler = NULL;
4471 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (size == 0) {
4474 if (consumed)
4475 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004476 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004477 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004479 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004480 _PyUnicodeWriter_Init(&writer);
4481 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004482
4483 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 e = s + size;
4485
4486 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004487 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004489 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (inShift) { /* in a base-64 section */
4492 if (IS_BASE64(ch)) { /* consume a base-64 character */
4493 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4494 base64bits += 6;
4495 s++;
4496 if (base64bits >= 16) {
4497 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004498 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 base64bits -= 16;
4500 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004501 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 if (surrogate) {
4503 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004504 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4505 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004506 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004507 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004509 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 }
4511 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004512 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004513 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 }
4516 }
Victor Stinner551ac952011-11-29 22:58:13 +01004517 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 /* first surrogate */
4519 surrogate = outCh;
4520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004522 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004523 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 }
4525 }
4526 }
4527 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (base64bits > 0) { /* left-over bits */
4530 if (base64bits >= 6) {
4531 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004532 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 errmsg = "partial character in shift sequence";
4534 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 else {
4537 /* Some bits remain; they should be zero */
4538 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004539 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 errmsg = "non-zero padding bits in shift sequence";
4541 goto utf7Error;
4542 }
4543 }
4544 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004545 if (surrogate && DECODE_DIRECT(ch)) {
4546 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4547 goto onError;
4548 }
4549 surrogate = 0;
4550 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 /* '-' is absorbed; other terminating
4552 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004553 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
4556 }
4557 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 s++; /* consume '+' */
4560 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004562 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004563 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004565 else if (s < e && !IS_BASE64(*s)) {
4566 s++;
4567 errmsg = "ill-formed sequence";
4568 goto utf7Error;
4569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004572 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004575 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 }
4577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004580 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 else {
4584 startinpos = s-starts;
4585 s++;
4586 errmsg = "unexpected special character";
4587 goto utf7Error;
4588 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004592 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 errors, &errorHandler,
4594 "utf7", errmsg,
4595 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 }
4599
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* end of string */
4601
4602 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4603 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004604 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (surrogate ||
4606 (base64bits >= 6) ||
4607 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004609 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 errors, &errorHandler,
4611 "utf7", "unterminated shift sequence",
4612 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004613 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 goto onError;
4615 if (s < e)
4616 goto restart;
4617 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619
4620 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004621 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004623 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004624 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004625 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004626 writer.kind, writer.data, shiftOutStart);
4627 Py_XDECREF(errorHandler);
4628 Py_XDECREF(exc);
4629 _PyUnicodeWriter_Dealloc(&writer);
4630 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004631 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004632 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 }
4634 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004635 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 Py_XDECREF(errorHandler);
4645 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 return NULL;
4648}
4649
4650
Alexander Belopolsky40018472011-02-26 01:02:56 +00004651PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652_PyUnicode_EncodeUTF7(PyObject *str,
4653 int base64SetO,
4654 int base64WhiteSpace,
4655 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004657 int kind;
4658 void *data;
4659 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004662 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 unsigned int base64bits = 0;
4664 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 char * out;
4666 char * start;
4667
Benjamin Petersonbac79492012-01-14 13:34:47 -05004668 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004669 return NULL;
4670 kind = PyUnicode_KIND(str);
4671 data = PyUnicode_DATA(str);
4672 len = PyUnicode_GET_LENGTH(str);
4673
4674 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004677 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004678 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004679 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004680 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681 if (v == NULL)
4682 return NULL;
4683
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004684 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004685 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 if (inShift) {
4689 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4690 /* shifting out */
4691 if (base64bits) { /* output remaining bits */
4692 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4693 base64buffer = 0;
4694 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
4696 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 /* Characters not in the BASE64 set implicitly unshift the sequence
4698 so no '-' is required, except if the character is itself a '-' */
4699 if (IS_BASE64(ch) || ch == '-') {
4700 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 *out++ = (char) ch;
4703 }
4704 else {
4705 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 else { /* not in a shift sequence */
4709 if (ch == '+') {
4710 *out++ = '+';
4711 *out++ = '-';
4712 }
4713 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4714 *out++ = (char) ch;
4715 }
4716 else {
4717 *out++ = '+';
4718 inShift = 1;
4719 goto encode_char;
4720 }
4721 }
4722 continue;
4723encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004725 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004726
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 /* code first surrogate */
4728 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004729 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 while (base64bits >= 6) {
4731 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4732 base64bits -= 6;
4733 }
4734 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004735 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 base64bits += 16;
4738 base64buffer = (base64buffer << 16) | ch;
4739 while (base64bits >= 6) {
4740 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4741 base64bits -= 6;
4742 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004743 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744 if (base64bits)
4745 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4746 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 if (_PyBytes_Resize(&v, out - start) < 0)
4749 return NULL;
4750 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752PyObject *
4753PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4754 Py_ssize_t size,
4755 int base64SetO,
4756 int base64WhiteSpace,
4757 const char *errors)
4758{
4759 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004760 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761 if (tmp == NULL)
4762 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004763 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004764 base64WhiteSpace, errors);
4765 Py_DECREF(tmp);
4766 return result;
4767}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004768
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769#undef IS_BASE64
4770#undef FROM_BASE64
4771#undef TO_BASE64
4772#undef DECODE_DIRECT
4773#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775/* --- UTF-8 Codec -------------------------------------------------------- */
4776
Alexander Belopolsky40018472011-02-26 01:02:56 +00004777PyObject *
4778PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004779 Py_ssize_t size,
4780 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
Walter Dörwald69652032004-09-07 20:24:22 +00004782 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4783}
4784
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785#include "stringlib/asciilib.h"
4786#include "stringlib/codecs.h"
4787#include "stringlib/undef.h"
4788
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004789#include "stringlib/ucs1lib.h"
4790#include "stringlib/codecs.h"
4791#include "stringlib/undef.h"
4792
4793#include "stringlib/ucs2lib.h"
4794#include "stringlib/codecs.h"
4795#include "stringlib/undef.h"
4796
4797#include "stringlib/ucs4lib.h"
4798#include "stringlib/codecs.h"
4799#include "stringlib/undef.h"
4800
Antoine Pitrouab868312009-01-10 15:40:25 +00004801/* Mask to quickly check whether a C 'long' contains a
4802 non-ASCII, UTF8-encoded char. */
4803#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004804# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004805#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004806# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004807#else
4808# error C 'long' size should be either 4 or 8!
4809#endif
4810
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811static Py_ssize_t
4812ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004817 /*
4818 * Issue #17237: m68k is a bit different from most architectures in
4819 * that objects do not use "natural alignment" - for example, int and
4820 * long are only aligned at 2-byte boundaries. Therefore the assert()
4821 * won't work; also, tests have shown that skipping the "optimised
4822 * version" will even speed up m68k.
4823 */
4824#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004826 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4827 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 /* Fast path, see in STRINGLIB(utf8_decode) for
4829 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004830 /* Help allocation */
4831 const char *_p = p;
4832 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 while (_p < aligned_end) {
4834 unsigned long value = *(const unsigned long *) _p;
4835 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004837 *((unsigned long *)q) = value;
4838 _p += SIZEOF_LONG;
4839 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 p = _p;
4842 while (p < end) {
4843 if ((unsigned char)*p & 0x80)
4844 break;
4845 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004850#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851 while (p < end) {
4852 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4853 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004854 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004855 /* Help allocation */
4856 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 while (_p < aligned_end) {
4858 unsigned long value = *(unsigned long *) _p;
4859 if (value & ASCII_CHAR_MASK)
4860 break;
4861 _p += SIZEOF_LONG;
4862 }
4863 p = _p;
4864 if (_p == end)
4865 break;
4866 }
4867 if ((unsigned char)*p & 0x80)
4868 break;
4869 ++p;
4870 }
4871 memcpy(dest, start, p - start);
4872 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873}
Antoine Pitrouab868312009-01-10 15:40:25 +00004874
Victor Stinner709d23d2019-05-02 14:56:30 -04004875static PyObject *
4876unicode_decode_utf8(const char *s, Py_ssize_t size,
4877 _Py_error_handler error_handler, const char *errors,
4878 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004879{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004880 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004881 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883
4884 Py_ssize_t startinpos;
4885 Py_ssize_t endinpos;
4886 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004887 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004889
4890 if (size == 0) {
4891 if (consumed)
4892 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004893 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004894 }
4895
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4897 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004898 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 *consumed = 1;
4900 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004901 }
4902
Victor Stinner8f674cc2013-04-17 23:02:17 +02004903 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004904 writer.min_length = size;
4905 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004906 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004907
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004908 writer.pos = ascii_decode(s, end, writer.data);
4909 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (s < end) {
4911 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004913
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 if (PyUnicode_IS_ASCII(writer.buffer))
4916 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004918 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 } else {
4922 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 }
4925
4926 switch (ch) {
4927 case 0:
4928 if (s == end || consumed)
4929 goto End;
4930 errmsg = "unexpected end of data";
4931 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004932 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 break;
4934 case 1:
4935 errmsg = "invalid start byte";
4936 startinpos = s - starts;
4937 endinpos = startinpos + 1;
4938 break;
4939 case 2:
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004940 case 3:
4941 case 4:
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004942 if (s == end || consumed) {
4943 goto End;
4944 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004945 errmsg = "invalid continuation byte";
4946 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004947 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004948 break;
4949 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004950 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004951 goto onError;
4952 continue;
4953 }
4954
Victor Stinner1d65d912015-10-05 13:43:50 +02004955 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004956 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004957
4958 switch (error_handler) {
4959 case _Py_ERROR_IGNORE:
4960 s += (endinpos - startinpos);
4961 break;
4962
4963 case _Py_ERROR_REPLACE:
4964 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4965 goto onError;
4966 s += (endinpos - startinpos);
4967 break;
4968
4969 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004970 {
4971 Py_ssize_t i;
4972
Victor Stinner1d65d912015-10-05 13:43:50 +02004973 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4974 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004975 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004976 ch = (Py_UCS4)(unsigned char)(starts[i]);
4977 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4978 ch + 0xdc00);
4979 writer.pos++;
4980 }
4981 s += (endinpos - startinpos);
4982 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004983 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004984
4985 default:
4986 if (unicode_decode_call_errorhandler_writer(
4987 errors, &error_handler_obj,
4988 "utf-8", errmsg,
4989 &starts, &end, &startinpos, &endinpos, &exc, &s,
4990 &writer))
4991 goto onError;
4992 }
Victor Stinner785938e2011-12-11 20:09:03 +01004993 }
4994
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004995End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004996 if (consumed)
4997 *consumed = s - starts;
4998
Victor Stinner1d65d912015-10-05 13:43:50 +02004999 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005001 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005002
5003onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005004 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005005 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005006 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005008}
5009
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005010
Victor Stinner709d23d2019-05-02 14:56:30 -04005011PyObject *
5012PyUnicode_DecodeUTF8Stateful(const char *s,
5013 Py_ssize_t size,
5014 const char *errors,
5015 Py_ssize_t *consumed)
5016{
5017 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5018}
5019
5020
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005021/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5022 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005023
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005024 On success, write a pointer to a newly allocated wide character string into
5025 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5026 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005027
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 On memory allocation failure, return -1.
5029
5030 On decoding error (if surrogateescape is zero), return -2. If wlen is
5031 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5032 is not NULL, write the decoding error message into *reason. */
5033int
5034_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005035 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005036{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005037 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005038 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005039 wchar_t *unicode;
5040 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005041
Victor Stinner3d4226a2018-08-29 22:21:32 +02005042 int surrogateescape = 0;
5043 int surrogatepass = 0;
5044 switch (errors)
5045 {
5046 case _Py_ERROR_STRICT:
5047 break;
5048 case _Py_ERROR_SURROGATEESCAPE:
5049 surrogateescape = 1;
5050 break;
5051 case _Py_ERROR_SURROGATEPASS:
5052 surrogatepass = 1;
5053 break;
5054 default:
5055 return -3;
5056 }
5057
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005058 /* Note: size will always be longer than the resulting Unicode
5059 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005060 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005061 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005062 }
5063
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005064 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005065 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005066 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005067 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005068
5069 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005070 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005071 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005072 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005073 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 if (ch > 0xFF) {
5080#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005081 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005082#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005083 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005084 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005085 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5086 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5087#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005088 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005090 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005091 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005092 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005093
5094 if (surrogateescape) {
5095 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5096 }
5097 else {
5098 /* Is it a valid three-byte code? */
5099 if (surrogatepass
5100 && (e - s) >= 3
5101 && (s[0] & 0xf0) == 0xe0
5102 && (s[1] & 0xc0) == 0x80
5103 && (s[2] & 0xc0) == 0x80)
5104 {
5105 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5106 s += 3;
5107 unicode[outpos++] = ch;
5108 }
5109 else {
5110 PyMem_RawFree(unicode );
5111 if (reason != NULL) {
5112 switch (ch) {
5113 case 0:
5114 *reason = "unexpected end of data";
5115 break;
5116 case 1:
5117 *reason = "invalid start byte";
5118 break;
5119 /* 2, 3, 4 */
5120 default:
5121 *reason = "invalid continuation byte";
5122 break;
5123 }
5124 }
5125 if (wlen != NULL) {
5126 *wlen = s - orig_s;
5127 }
5128 return -2;
5129 }
5130 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005131 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005133 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005134 if (wlen) {
5135 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005136 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005137 *wstr = unicode;
5138 return 0;
5139}
5140
Victor Stinner5f9cf232019-03-19 01:46:25 +01005141
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005142wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005143_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5144 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005145{
5146 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005147 int res = _Py_DecodeUTF8Ex(arg, arglen,
5148 &wstr, wlen,
5149 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005150 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005151 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5152 assert(res != -3);
5153 if (wlen) {
5154 *wlen = (size_t)res;
5155 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005156 return NULL;
5157 }
5158 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005159}
5160
Antoine Pitrouab868312009-01-10 15:40:25 +00005161
Victor Stinnere47e6982017-12-21 15:45:16 +01005162/* UTF-8 encoder using the surrogateescape error handler .
5163
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005164 On success, return 0 and write the newly allocated character string (use
5165 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005166
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005167 On encoding failure, return -2 and write the position of the invalid
5168 surrogate character into *error_pos (if error_pos is set) and the decoding
5169 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005170
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005171 On memory allocation failure, return -1. */
5172int
5173_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005174 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005175{
5176 const Py_ssize_t max_char_size = 4;
5177 Py_ssize_t len = wcslen(text);
5178
5179 assert(len >= 0);
5180
Victor Stinner3d4226a2018-08-29 22:21:32 +02005181 int surrogateescape = 0;
5182 int surrogatepass = 0;
5183 switch (errors)
5184 {
5185 case _Py_ERROR_STRICT:
5186 break;
5187 case _Py_ERROR_SURROGATEESCAPE:
5188 surrogateescape = 1;
5189 break;
5190 case _Py_ERROR_SURROGATEPASS:
5191 surrogatepass = 1;
5192 break;
5193 default:
5194 return -3;
5195 }
5196
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005197 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5198 return -1;
5199 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005200 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (raw_malloc) {
5202 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005203 }
5204 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005206 }
5207 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005208 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005209 }
5210
5211 char *p = bytes;
5212 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005213 for (i = 0; i < len; ) {
5214 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005215 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005216 i++;
5217#if Py_UNICODE_SIZE == 2
5218 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5219 && i < len
5220 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5221 {
5222 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5223 i++;
5224 }
5225#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005226
5227 if (ch < 0x80) {
5228 /* Encode ASCII */
5229 *p++ = (char) ch;
5230
5231 }
5232 else if (ch < 0x0800) {
5233 /* Encode Latin-1 */
5234 *p++ = (char)(0xc0 | (ch >> 6));
5235 *p++ = (char)(0x80 | (ch & 0x3f));
5236 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005237 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005238 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005239 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005240 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005241 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005242 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 if (reason != NULL) {
5244 *reason = "encoding error";
5245 }
5246 if (raw_malloc) {
5247 PyMem_RawFree(bytes);
5248 }
5249 else {
5250 PyMem_Free(bytes);
5251 }
5252 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005253 }
5254 *p++ = (char)(ch & 0xff);
5255 }
5256 else if (ch < 0x10000) {
5257 *p++ = (char)(0xe0 | (ch >> 12));
5258 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5259 *p++ = (char)(0x80 | (ch & 0x3f));
5260 }
5261 else { /* ch >= 0x10000 */
5262 assert(ch <= MAX_UNICODE);
5263 /* Encode UCS4 Unicode ordinals */
5264 *p++ = (char)(0xf0 | (ch >> 18));
5265 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5266 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5267 *p++ = (char)(0x80 | (ch & 0x3f));
5268 }
5269 }
5270 *p++ = '\0';
5271
5272 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005273 char *bytes2;
5274 if (raw_malloc) {
5275 bytes2 = PyMem_RawRealloc(bytes, final_size);
5276 }
5277 else {
5278 bytes2 = PyMem_Realloc(bytes, final_size);
5279 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005280 if (bytes2 == NULL) {
5281 if (error_pos != NULL) {
5282 *error_pos = (size_t)-1;
5283 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005284 if (raw_malloc) {
5285 PyMem_RawFree(bytes);
5286 }
5287 else {
5288 PyMem_Free(bytes);
5289 }
5290 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005291 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005292 *str = bytes2;
5293 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005294}
5295
5296
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005297/* Primary internal function which creates utf8 encoded bytes objects.
5298
5299 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005300 and allocate exactly as much space needed at the end. Else allocate the
5301 maximum possible needed (4 result bytes per Unicode character), and return
5302 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005303*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005304static PyObject *
5305unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5306 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005307{
Victor Stinner6099a032011-12-18 14:22:26 +01005308 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005309 void *data;
5310 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005311
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005312 if (!PyUnicode_Check(unicode)) {
5313 PyErr_BadArgument();
5314 return NULL;
5315 }
5316
5317 if (PyUnicode_READY(unicode) == -1)
5318 return NULL;
5319
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005320 if (PyUnicode_UTF8(unicode))
5321 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5322 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005323
5324 kind = PyUnicode_KIND(unicode);
5325 data = PyUnicode_DATA(unicode);
5326 size = PyUnicode_GET_LENGTH(unicode);
5327
Benjamin Petersonead6b532011-12-20 17:23:42 -06005328 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005329 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005330 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005331 case PyUnicode_1BYTE_KIND:
5332 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5333 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005334 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005335 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005336 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005337 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005338 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005339 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005340}
5341
Alexander Belopolsky40018472011-02-26 01:02:56 +00005342PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005343_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5344{
5345 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5346}
5347
5348
5349PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005350PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5351 Py_ssize_t size,
5352 const char *errors)
5353{
5354 PyObject *v, *unicode;
5355
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005356 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005357 if (unicode == NULL)
5358 return NULL;
5359 v = _PyUnicode_AsUTF8String(unicode, errors);
5360 Py_DECREF(unicode);
5361 return v;
5362}
5363
5364PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005365PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005366{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005367 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005368}
5369
Walter Dörwald41980ca2007-08-16 21:55:45 +00005370/* --- UTF-32 Codec ------------------------------------------------------- */
5371
5372PyObject *
5373PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005374 Py_ssize_t size,
5375 const char *errors,
5376 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005377{
5378 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5379}
5380
5381PyObject *
5382PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005383 Py_ssize_t size,
5384 const char *errors,
5385 int *byteorder,
5386 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005387{
5388 const char *starts = s;
5389 Py_ssize_t startinpos;
5390 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005391 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005392 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005393 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005394 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005395 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005396 PyObject *errorHandler = NULL;
5397 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005398
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 q = (unsigned char *)s;
5400 e = q + size;
5401
5402 if (byteorder)
5403 bo = *byteorder;
5404
5405 /* Check for BOM marks (U+FEFF) in the input and adjust current
5406 byte order setting accordingly. In native mode, the leading BOM
5407 mark is skipped, in all other modes, it is copied to the output
5408 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005409 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005410 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005411 if (bom == 0x0000FEFF) {
5412 bo = -1;
5413 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005414 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005415 else if (bom == 0xFFFE0000) {
5416 bo = 1;
5417 q += 4;
5418 }
5419 if (byteorder)
5420 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005421 }
5422
Victor Stinnere64322e2012-10-30 23:12:47 +01005423 if (q == e) {
5424 if (consumed)
5425 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005426 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005427 }
5428
Victor Stinnere64322e2012-10-30 23:12:47 +01005429#ifdef WORDS_BIGENDIAN
5430 le = bo < 0;
5431#else
5432 le = bo <= 0;
5433#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005434 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005435
Victor Stinner8f674cc2013-04-17 23:02:17 +02005436 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005437 writer.min_length = (e - q + 3) / 4;
5438 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005439 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005440
Victor Stinnere64322e2012-10-30 23:12:47 +01005441 while (1) {
5442 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005444
Victor Stinnere64322e2012-10-30 23:12:47 +01005445 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005446 enum PyUnicode_Kind kind = writer.kind;
5447 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005448 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005449 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005450 if (le) {
5451 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005452 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005453 if (ch > maxch)
5454 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005455 if (kind != PyUnicode_1BYTE_KIND &&
5456 Py_UNICODE_IS_SURROGATE(ch))
5457 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005458 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005459 q += 4;
5460 } while (q <= last);
5461 }
5462 else {
5463 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005464 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005465 if (ch > maxch)
5466 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005467 if (kind != PyUnicode_1BYTE_KIND &&
5468 Py_UNICODE_IS_SURROGATE(ch))
5469 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005470 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005471 q += 4;
5472 } while (q <= last);
5473 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005474 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005475 }
5476
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005477 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005478 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005479 startinpos = ((const char *)q) - starts;
5480 endinpos = startinpos + 4;
5481 }
5482 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005483 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005484 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005485 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005486 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 startinpos = ((const char *)q) - starts;
5488 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005489 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005490 else {
5491 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005492 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005493 goto onError;
5494 q += 4;
5495 continue;
5496 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005497 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005498 startinpos = ((const char *)q) - starts;
5499 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005500 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005501
5502 /* The remaining input chars are ignored if the callback
5503 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005504 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005506 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005507 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005508 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005510 }
5511
Walter Dörwald41980ca2007-08-16 21:55:45 +00005512 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005514
Walter Dörwald41980ca2007-08-16 21:55:45 +00005515 Py_XDECREF(errorHandler);
5516 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005517 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518
Benjamin Peterson29060642009-01-31 22:14:21 +00005519 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005520 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005521 Py_XDECREF(errorHandler);
5522 Py_XDECREF(exc);
5523 return NULL;
5524}
5525
5526PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005527_PyUnicode_EncodeUTF32(PyObject *str,
5528 const char *errors,
5529 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005530{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005531 enum PyUnicode_Kind kind;
5532 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005533 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005534 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005535 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005536#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005537 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005538#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005539 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005540#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005541 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005542 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005543 PyObject *errorHandler = NULL;
5544 PyObject *exc = NULL;
5545 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005546
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005547 if (!PyUnicode_Check(str)) {
5548 PyErr_BadArgument();
5549 return NULL;
5550 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005551 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005552 return NULL;
5553 kind = PyUnicode_KIND(str);
5554 data = PyUnicode_DATA(str);
5555 len = PyUnicode_GET_LENGTH(str);
5556
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005557 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005558 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005559 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005560 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005561 if (v == NULL)
5562 return NULL;
5563
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005564 /* output buffer is 4-bytes aligned */
5565 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005566 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005567 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005568 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005569 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005570 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005571
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005573 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005574 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005575 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005576 else
5577 encoding = "utf-32";
5578
5579 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005580 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5581 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005582 }
5583
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 pos = 0;
5585 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005586 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005587
5588 if (kind == PyUnicode_2BYTE_KIND) {
5589 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5590 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005591 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005592 else {
5593 assert(kind == PyUnicode_4BYTE_KIND);
5594 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5595 &out, native_ordering);
5596 }
5597 if (pos == len)
5598 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005599
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005600 rep = unicode_encode_call_errorhandler(
5601 errors, &errorHandler,
5602 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005603 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 if (!rep)
5605 goto error;
5606
5607 if (PyBytes_Check(rep)) {
5608 repsize = PyBytes_GET_SIZE(rep);
5609 if (repsize & 3) {
5610 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005611 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005612 "surrogates not allowed");
5613 goto error;
5614 }
5615 moreunits = repsize / 4;
5616 }
5617 else {
5618 assert(PyUnicode_Check(rep));
5619 if (PyUnicode_READY(rep) < 0)
5620 goto error;
5621 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5622 if (!PyUnicode_IS_ASCII(rep)) {
5623 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005624 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005625 "surrogates not allowed");
5626 goto error;
5627 }
5628 }
5629
5630 /* four bytes are reserved for each surrogate */
5631 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005632 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005633 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005634 /* integer overflow */
5635 PyErr_NoMemory();
5636 goto error;
5637 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005638 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005639 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005640 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005641 }
5642
5643 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005644 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005645 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005646 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005647 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005648 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5649 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 }
5651
5652 Py_CLEAR(rep);
5653 }
5654
5655 /* Cut back to size actually needed. This is necessary for, for example,
5656 encoding of a string containing isolated surrogates and the 'ignore'
5657 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005658 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005659 if (nsize != PyBytes_GET_SIZE(v))
5660 _PyBytes_Resize(&v, nsize);
5661 Py_XDECREF(errorHandler);
5662 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005663 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005664 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 error:
5666 Py_XDECREF(rep);
5667 Py_XDECREF(errorHandler);
5668 Py_XDECREF(exc);
5669 Py_XDECREF(v);
5670 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005671}
5672
Alexander Belopolsky40018472011-02-26 01:02:56 +00005673PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005674PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5675 Py_ssize_t size,
5676 const char *errors,
5677 int byteorder)
5678{
5679 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005680 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005681 if (tmp == NULL)
5682 return NULL;
5683 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5684 Py_DECREF(tmp);
5685 return result;
5686}
5687
5688PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005689PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005690{
Victor Stinnerb960b342011-11-20 19:12:52 +01005691 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005692}
5693
Guido van Rossumd57fd912000-03-10 22:53:23 +00005694/* --- UTF-16 Codec ------------------------------------------------------- */
5695
Tim Peters772747b2001-08-09 22:21:55 +00005696PyObject *
5697PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005698 Py_ssize_t size,
5699 const char *errors,
5700 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005701{
Walter Dörwald69652032004-09-07 20:24:22 +00005702 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5703}
5704
5705PyObject *
5706PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005707 Py_ssize_t size,
5708 const char *errors,
5709 int *byteorder,
5710 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005711{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005712 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005713 Py_ssize_t startinpos;
5714 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005715 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005716 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005717 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005718 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005719 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005720 PyObject *errorHandler = NULL;
5721 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005723
Tim Peters772747b2001-08-09 22:21:55 +00005724 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005725 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005726
5727 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005728 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005729
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005730 /* Check for BOM marks (U+FEFF) in the input and adjust current
5731 byte order setting accordingly. In native mode, the leading BOM
5732 mark is skipped, in all other modes, it is copied to the output
5733 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005734 if (bo == 0 && size >= 2) {
5735 const Py_UCS4 bom = (q[1] << 8) | q[0];
5736 if (bom == 0xFEFF) {
5737 q += 2;
5738 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005739 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005740 else if (bom == 0xFFFE) {
5741 q += 2;
5742 bo = 1;
5743 }
5744 if (byteorder)
5745 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005746 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005747
Antoine Pitrou63065d72012-05-15 23:48:04 +02005748 if (q == e) {
5749 if (consumed)
5750 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005751 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005752 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005753
Christian Heimes743e0cd2012-10-17 23:52:17 +02005754#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005755 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005757#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005758 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005759 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005760#endif
Tim Peters772747b2001-08-09 22:21:55 +00005761
Antoine Pitrou63065d72012-05-15 23:48:04 +02005762 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005763 character count normally. Error handler will take care of
5764 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005765 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005766 writer.min_length = (e - q + 1) / 2;
5767 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005768 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005769
Antoine Pitrou63065d72012-05-15 23:48:04 +02005770 while (1) {
5771 Py_UCS4 ch = 0;
5772 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005773 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005774 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005775 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005776 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005778 native_ordering);
5779 else
5780 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005781 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005782 native_ordering);
5783 } else if (kind == PyUnicode_2BYTE_KIND) {
5784 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 native_ordering);
5787 } else {
5788 assert(kind == PyUnicode_4BYTE_KIND);
5789 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005790 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005791 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005792 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005793 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005794
Antoine Pitrou63065d72012-05-15 23:48:04 +02005795 switch (ch)
5796 {
5797 case 0:
5798 /* remaining byte at the end? (size should be even) */
5799 if (q == e || consumed)
5800 goto End;
5801 errmsg = "truncated data";
5802 startinpos = ((const char *)q) - starts;
5803 endinpos = ((const char *)e) - starts;
5804 break;
5805 /* The remaining input chars are ignored if the callback
5806 chooses to skip the input */
5807 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005808 q -= 2;
5809 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005810 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005811 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005812 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005813 endinpos = ((const char *)e) - starts;
5814 break;
5815 case 2:
5816 errmsg = "illegal encoding";
5817 startinpos = ((const char *)q) - 2 - starts;
5818 endinpos = startinpos + 2;
5819 break;
5820 case 3:
5821 errmsg = "illegal UTF-16 surrogate";
5822 startinpos = ((const char *)q) - 4 - starts;
5823 endinpos = startinpos + 2;
5824 break;
5825 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005826 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005827 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005828 continue;
5829 }
5830
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005831 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005832 errors,
5833 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005834 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005835 &starts,
5836 (const char **)&e,
5837 &startinpos,
5838 &endinpos,
5839 &exc,
5840 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005841 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005842 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005843 }
5844
Antoine Pitrou63065d72012-05-15 23:48:04 +02005845End:
Walter Dörwald69652032004-09-07 20:24:22 +00005846 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005847 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005848
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005849 Py_XDECREF(errorHandler);
5850 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005851 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005852
Benjamin Peterson29060642009-01-31 22:14:21 +00005853 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005854 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005855 Py_XDECREF(errorHandler);
5856 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005857 return NULL;
5858}
5859
Tim Peters772747b2001-08-09 22:21:55 +00005860PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005861_PyUnicode_EncodeUTF16(PyObject *str,
5862 const char *errors,
5863 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005864{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005865 enum PyUnicode_Kind kind;
5866 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005867 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005868 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005869 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005870 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005871#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005872 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005873#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005874 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005875#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005876 const char *encoding;
5877 Py_ssize_t nsize, pos;
5878 PyObject *errorHandler = NULL;
5879 PyObject *exc = NULL;
5880 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005881
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005882 if (!PyUnicode_Check(str)) {
5883 PyErr_BadArgument();
5884 return NULL;
5885 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005886 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005887 return NULL;
5888 kind = PyUnicode_KIND(str);
5889 data = PyUnicode_DATA(str);
5890 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005891
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005892 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005893 if (kind == PyUnicode_4BYTE_KIND) {
5894 const Py_UCS4 *in = (const Py_UCS4 *)data;
5895 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005896 while (in < end) {
5897 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005898 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005899 }
5900 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005901 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005902 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005903 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005904 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005905 nsize = len + pairs + (byteorder == 0);
5906 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005907 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005908 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005909 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005910
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005911 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005912 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005913 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005914 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005915 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005916 }
5917 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005918 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005919 }
Tim Peters772747b2001-08-09 22:21:55 +00005920
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005921 if (kind == PyUnicode_1BYTE_KIND) {
5922 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5923 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005924 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005925
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005926 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005927 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005928 }
5929 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005930 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005931 }
5932 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005933 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005934 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005935
5936 pos = 0;
5937 while (pos < len) {
5938 Py_ssize_t repsize, moreunits;
5939
5940 if (kind == PyUnicode_2BYTE_KIND) {
5941 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5942 &out, native_ordering);
5943 }
5944 else {
5945 assert(kind == PyUnicode_4BYTE_KIND);
5946 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5947 &out, native_ordering);
5948 }
5949 if (pos == len)
5950 break;
5951
5952 rep = unicode_encode_call_errorhandler(
5953 errors, &errorHandler,
5954 encoding, "surrogates not allowed",
5955 str, &exc, pos, pos + 1, &pos);
5956 if (!rep)
5957 goto error;
5958
5959 if (PyBytes_Check(rep)) {
5960 repsize = PyBytes_GET_SIZE(rep);
5961 if (repsize & 1) {
5962 raise_encode_exception(&exc, encoding,
5963 str, pos - 1, pos,
5964 "surrogates not allowed");
5965 goto error;
5966 }
5967 moreunits = repsize / 2;
5968 }
5969 else {
5970 assert(PyUnicode_Check(rep));
5971 if (PyUnicode_READY(rep) < 0)
5972 goto error;
5973 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5974 if (!PyUnicode_IS_ASCII(rep)) {
5975 raise_encode_exception(&exc, encoding,
5976 str, pos - 1, pos,
5977 "surrogates not allowed");
5978 goto error;
5979 }
5980 }
5981
5982 /* two bytes are reserved for each surrogate */
5983 if (moreunits > 1) {
5984 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005985 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005986 /* integer overflow */
5987 PyErr_NoMemory();
5988 goto error;
5989 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005990 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005991 goto error;
5992 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5993 }
5994
5995 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005996 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005997 out += moreunits;
5998 } else /* rep is unicode */ {
5999 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6000 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6001 &out, native_ordering);
6002 }
6003
6004 Py_CLEAR(rep);
6005 }
6006
6007 /* Cut back to size actually needed. This is necessary for, for example,
6008 encoding of a string containing isolated surrogates and the 'ignore' handler
6009 is used. */
6010 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6011 if (nsize != PyBytes_GET_SIZE(v))
6012 _PyBytes_Resize(&v, nsize);
6013 Py_XDECREF(errorHandler);
6014 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006015 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006016 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006017 error:
6018 Py_XDECREF(rep);
6019 Py_XDECREF(errorHandler);
6020 Py_XDECREF(exc);
6021 Py_XDECREF(v);
6022 return NULL;
6023#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006024}
6025
Alexander Belopolsky40018472011-02-26 01:02:56 +00006026PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006027PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6028 Py_ssize_t size,
6029 const char *errors,
6030 int byteorder)
6031{
6032 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006033 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006034 if (tmp == NULL)
6035 return NULL;
6036 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6037 Py_DECREF(tmp);
6038 return result;
6039}
6040
6041PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006042PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006043{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006044 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006045}
6046
6047/* --- Unicode Escape Codec ----------------------------------------------- */
6048
/* Cached pointer to the character-name C API.  It starts out NULL; the
   \N{name} branch of the unicode-escape decoder that follows fills it in
   through PyCapsule_Import() the first time it finds it unset. */
Fredrik Lundh06d12682001-01-24 07:59:11 +00006049static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006050
Alexander Belopolsky40018472011-02-26 01:02:56 +00006051PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006052_PyUnicode_DecodeUnicodeEscape(const char *s,
6053 Py_ssize_t size,
6054 const char *errors,
6055 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006056{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006057 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006058 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006059 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006060 PyObject *errorHandler = NULL;
6061 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006062
Eric V. Smith42454af2016-10-31 09:22:08 -04006063 // so we can remember if we've seen an invalid escape char or not
6064 *first_invalid_escape = NULL;
6065
Victor Stinner62ec3312016-09-06 17:04:34 -07006066 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006067 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006068 }
6069 /* Escaped strings will always be longer than the resulting
6070 Unicode string, so we start with size here and then reduce the
6071 length after conversion to the true value.
6072 (but if the error callback returns a long replacement string
6073 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006074 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006075 writer.min_length = size;
6076 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6077 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006078 }
6079
Guido van Rossumd57fd912000-03-10 22:53:23 +00006080 end = s + size;
6081 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006082 unsigned char c = (unsigned char) *s++;
6083 Py_UCS4 ch;
6084 int count;
6085 Py_ssize_t startinpos;
6086 Py_ssize_t endinpos;
6087 const char *message;
6088
6089#define WRITE_ASCII_CHAR(ch) \
6090 do { \
6091 assert(ch <= 127); \
6092 assert(writer.pos < writer.size); \
6093 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6094 } while(0)
6095
6096#define WRITE_CHAR(ch) \
6097 do { \
6098 if (ch <= writer.maxchar) { \
6099 assert(writer.pos < writer.size); \
6100 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6101 } \
6102 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6103 goto onError; \
6104 } \
6105 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006106
6107 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006108 if (c != '\\') {
6109 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110 continue;
6111 }
6112
Victor Stinner62ec3312016-09-06 17:04:34 -07006113 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006115 if (s >= end) {
6116 message = "\\ at end of string";
6117 goto error;
6118 }
6119 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006120
Victor Stinner62ec3312016-09-06 17:04:34 -07006121 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006122 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006123
Benjamin Peterson29060642009-01-31 22:14:21 +00006124 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 case '\n': continue;
6126 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6127 case '\'': WRITE_ASCII_CHAR('\''); continue;
6128 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6129 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006130 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006131 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6132 case 't': WRITE_ASCII_CHAR('\t'); continue;
6133 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6134 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006135 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006136 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006137 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006138 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006139
Benjamin Peterson29060642009-01-31 22:14:21 +00006140 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006141 case '0': case '1': case '2': case '3':
6142 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006143 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006144 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006145 ch = (ch<<3) + *s++ - '0';
6146 if (s < end && '0' <= *s && *s <= '7') {
6147 ch = (ch<<3) + *s++ - '0';
6148 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006149 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006150 WRITE_CHAR(ch);
6151 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006152
Benjamin Peterson29060642009-01-31 22:14:21 +00006153 /* hex escapes */
6154 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006155 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006156 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006157 message = "truncated \\xXX escape";
6158 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159
Benjamin Peterson29060642009-01-31 22:14:21 +00006160 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006161 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006162 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006163 message = "truncated \\uXXXX escape";
6164 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165
Benjamin Peterson29060642009-01-31 22:14:21 +00006166 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006167 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006168 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006169 message = "truncated \\UXXXXXXXX escape";
6170 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006171 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006172 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006173 ch <<= 4;
6174 if (c >= '0' && c <= '9') {
6175 ch += c - '0';
6176 }
6177 else if (c >= 'a' && c <= 'f') {
6178 ch += c - ('a' - 10);
6179 }
6180 else if (c >= 'A' && c <= 'F') {
6181 ch += c - ('A' - 10);
6182 }
6183 else {
6184 break;
6185 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006186 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006187 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006188 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006189 }
6190
6191 /* when we get here, ch is a 32-bit unicode character */
6192 if (ch > MAX_UNICODE) {
6193 message = "illegal Unicode character";
6194 goto error;
6195 }
6196
6197 WRITE_CHAR(ch);
6198 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006199
Benjamin Peterson29060642009-01-31 22:14:21 +00006200 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006201 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006202 if (ucnhash_CAPI == NULL) {
6203 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006204 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6205 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006206 if (ucnhash_CAPI == NULL) {
6207 PyErr_SetString(
6208 PyExc_UnicodeError,
6209 "\\N escapes not supported (can't load unicodedata module)"
6210 );
6211 goto onError;
6212 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006213 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006214
6215 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006216 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006217 const char *start = ++s;
6218 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006219 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006220 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006221 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006222 namelen = s - start;
6223 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006224 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006225 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 ch = 0xffffffff; /* in case 'getcode' messes up */
6227 if (namelen <= INT_MAX &&
6228 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6229 &ch, 0)) {
6230 assert(ch <= MAX_UNICODE);
6231 WRITE_CHAR(ch);
6232 continue;
6233 }
6234 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006235 }
6236 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006237 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006238
6239 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006240 if (*first_invalid_escape == NULL) {
6241 *first_invalid_escape = s-1; /* Back up one char, since we've
6242 already incremented s. */
6243 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006244 WRITE_ASCII_CHAR('\\');
6245 WRITE_CHAR(c);
6246 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006247 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006248
6249 error:
6250 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006251 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006252 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006253 errors, &errorHandler,
6254 "unicodeescape", message,
6255 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006256 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006257 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006259 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006260
6261#undef WRITE_ASCII_CHAR
6262#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006264
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006265 Py_XDECREF(errorHandler);
6266 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006267 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006268
Benjamin Peterson29060642009-01-31 22:14:21 +00006269 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006270 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006271 Py_XDECREF(errorHandler);
6272 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006273 return NULL;
6274}
6275
Eric V. Smith42454af2016-10-31 09:22:08 -04006276PyObject *
6277PyUnicode_DecodeUnicodeEscape(const char *s,
6278 Py_ssize_t size,
6279 const char *errors)
6280{
6281 const char *first_invalid_escape;
6282 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6283 &first_invalid_escape);
6284 if (result == NULL)
6285 return NULL;
6286 if (first_invalid_escape != NULL) {
6287 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6288 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006289 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006290 Py_DECREF(result);
6291 return NULL;
6292 }
6293 }
6294 return result;
6295}
6296
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006297/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006298
Alexander Belopolsky40018472011-02-26 01:02:56 +00006299PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006300PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006301{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006302 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006303 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006304 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006305 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308
Ezio Melottie7f90372012-10-05 03:33:31 +03006309 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006310 escape.
6311
Ezio Melottie7f90372012-10-05 03:33:31 +03006312 For UCS1 strings it's '\xxx', 4 bytes per source character.
6313 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6314 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006315 */
6316
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006317 if (!PyUnicode_Check(unicode)) {
6318 PyErr_BadArgument();
6319 return NULL;
6320 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006321 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006322 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006323 }
Victor Stinner358af132015-10-12 22:36:57 +02006324
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006325 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006326 if (len == 0) {
6327 return PyBytes_FromStringAndSize(NULL, 0);
6328 }
6329
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006330 kind = PyUnicode_KIND(unicode);
6331 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006332 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6333 bytes, and 1 byte characters 4. */
6334 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006335 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 return PyErr_NoMemory();
6337 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006338 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 if (repr == NULL) {
6340 return NULL;
6341 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006342
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006344 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006345 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006346
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 /* U+0000-U+00ff range */
6348 if (ch < 0x100) {
6349 if (ch >= ' ' && ch < 127) {
6350 if (ch != '\\') {
6351 /* Copy printable US ASCII as-is */
6352 *p++ = (char) ch;
6353 }
6354 /* Escape backslashes */
6355 else {
6356 *p++ = '\\';
6357 *p++ = '\\';
6358 }
6359 }
Victor Stinner358af132015-10-12 22:36:57 +02006360
Victor Stinner62ec3312016-09-06 17:04:34 -07006361 /* Map special whitespace to '\t', \n', '\r' */
6362 else if (ch == '\t') {
6363 *p++ = '\\';
6364 *p++ = 't';
6365 }
6366 else if (ch == '\n') {
6367 *p++ = '\\';
6368 *p++ = 'n';
6369 }
6370 else if (ch == '\r') {
6371 *p++ = '\\';
6372 *p++ = 'r';
6373 }
6374
6375 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6376 else {
6377 *p++ = '\\';
6378 *p++ = 'x';
6379 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6380 *p++ = Py_hexdigits[ch & 0x000F];
6381 }
Tim Petersced69f82003-09-16 20:30:58 +00006382 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006383 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006384 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 *p++ = '\\';
6386 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006387 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6388 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6389 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6390 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006391 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006392 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6393 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006394
Victor Stinner62ec3312016-09-06 17:04:34 -07006395 /* Make sure that the first two digits are zero */
6396 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006397 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006398 *p++ = 'U';
6399 *p++ = '0';
6400 *p++ = '0';
6401 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6402 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6403 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6404 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6405 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6406 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006407 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006408 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006409
Victor Stinner62ec3312016-09-06 17:04:34 -07006410 assert(p - PyBytes_AS_STRING(repr) > 0);
6411 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6412 return NULL;
6413 }
6414 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006415}
6416
Alexander Belopolsky40018472011-02-26 01:02:56 +00006417PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006418PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6419 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006421 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006422 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006423 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 }
6426
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006427 result = PyUnicode_AsUnicodeEscapeString(tmp);
6428 Py_DECREF(tmp);
6429 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430}
6431
6432/* --- Raw Unicode Escape Codec ------------------------------------------- */
6433
Alexander Belopolsky40018472011-02-26 01:02:56 +00006434PyObject *
6435PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006436 Py_ssize_t size,
6437 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006438{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006439 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006440 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006441 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006442 PyObject *errorHandler = NULL;
6443 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006444
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006446 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006447 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006448
Guido van Rossumd57fd912000-03-10 22:53:23 +00006449 /* Escaped strings will always be longer than the resulting
6450 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006451 length after conversion to the true value. (But decoding error
6452 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006453 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 writer.min_length = size;
6455 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6456 goto onError;
6457 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006458
Guido van Rossumd57fd912000-03-10 22:53:23 +00006459 end = s + size;
6460 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 unsigned char c = (unsigned char) *s++;
6462 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006463 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006464 Py_ssize_t startinpos;
6465 Py_ssize_t endinpos;
6466 const char *message;
6467
6468#define WRITE_CHAR(ch) \
6469 do { \
6470 if (ch <= writer.maxchar) { \
6471 assert(writer.pos < writer.size); \
6472 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6473 } \
6474 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6475 goto onError; \
6476 } \
6477 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006478
Benjamin Peterson29060642009-01-31 22:14:21 +00006479 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006480 if (c != '\\' || s >= end) {
6481 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006482 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006483 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006484
Victor Stinner62ec3312016-09-06 17:04:34 -07006485 c = (unsigned char) *s++;
6486 if (c == 'u') {
6487 count = 4;
6488 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006489 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006490 else if (c == 'U') {
6491 count = 8;
6492 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006493 }
6494 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006495 assert(writer.pos < writer.size);
6496 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6497 WRITE_CHAR(c);
6498 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006499 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006500 startinpos = s - starts - 2;
6501
6502 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6503 for (ch = 0; count && s < end; ++s, --count) {
6504 c = (unsigned char)*s;
6505 ch <<= 4;
6506 if (c >= '0' && c <= '9') {
6507 ch += c - '0';
6508 }
6509 else if (c >= 'a' && c <= 'f') {
6510 ch += c - ('a' - 10);
6511 }
6512 else if (c >= 'A' && c <= 'F') {
6513 ch += c - ('A' - 10);
6514 }
6515 else {
6516 break;
6517 }
6518 }
6519 if (!count) {
6520 if (ch <= MAX_UNICODE) {
6521 WRITE_CHAR(ch);
6522 continue;
6523 }
6524 message = "\\Uxxxxxxxx out of range";
6525 }
6526
6527 endinpos = s-starts;
6528 writer.min_length = end - s + writer.pos;
6529 if (unicode_decode_call_errorhandler_writer(
6530 errors, &errorHandler,
6531 "rawunicodeescape", message,
6532 &starts, &end, &startinpos, &endinpos, &exc, &s,
6533 &writer)) {
6534 goto onError;
6535 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006536 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006537
6538#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006539 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006540 Py_XDECREF(errorHandler);
6541 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006542 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006543
Benjamin Peterson29060642009-01-31 22:14:21 +00006544 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006545 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006546 Py_XDECREF(errorHandler);
6547 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006548 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006549
Guido van Rossumd57fd912000-03-10 22:53:23 +00006550}
6551
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006552
Alexander Belopolsky40018472011-02-26 01:02:56 +00006553PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006554PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006555{
Victor Stinner62ec3312016-09-06 17:04:34 -07006556 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006557 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006558 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006559 int kind;
6560 void *data;
6561 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006562
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006563 if (!PyUnicode_Check(unicode)) {
6564 PyErr_BadArgument();
6565 return NULL;
6566 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006567 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006568 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006570 kind = PyUnicode_KIND(unicode);
6571 data = PyUnicode_DATA(unicode);
6572 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 if (kind == PyUnicode_1BYTE_KIND) {
6574 return PyBytes_FromStringAndSize(data, len);
6575 }
Victor Stinner0e368262011-11-10 20:12:49 +01006576
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6578 bytes, and 1 byte characters 4. */
6579 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006580
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 if (len > PY_SSIZE_T_MAX / expandsize) {
6582 return PyErr_NoMemory();
6583 }
6584 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6585 if (repr == NULL) {
6586 return NULL;
6587 }
6588 if (len == 0) {
6589 return repr;
6590 }
6591
6592 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006593 for (pos = 0; pos < len; pos++) {
6594 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006595
Victor Stinner62ec3312016-09-06 17:04:34 -07006596 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6597 if (ch < 0x100) {
6598 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006599 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006600 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006601 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006602 *p++ = '\\';
6603 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006604 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6605 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6606 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6607 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006608 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006609 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6610 else {
6611 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6612 *p++ = '\\';
6613 *p++ = 'U';
6614 *p++ = '0';
6615 *p++ = '0';
6616 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6617 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6618 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6619 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6620 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6621 *p++ = Py_hexdigits[ch & 15];
6622 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006623 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006624
Victor Stinner62ec3312016-09-06 17:04:34 -07006625 assert(p > PyBytes_AS_STRING(repr));
6626 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6627 return NULL;
6628 }
6629 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006630}
6631
Alexander Belopolsky40018472011-02-26 01:02:56 +00006632PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006633PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6634 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006635{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006636 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006637 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006638 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006639 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006640 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6641 Py_DECREF(tmp);
6642 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006643}
6644
6645/* --- Latin-1 Codec ------------------------------------------------------ */
6646
Alexander Belopolsky40018472011-02-26 01:02:56 +00006647PyObject *
6648PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006649 Py_ssize_t size,
6650 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006651{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006652 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006653 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006654}
6655
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006656/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006657static void
6658make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006659 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006660 PyObject *unicode,
6661 Py_ssize_t startpos, Py_ssize_t endpos,
6662 const char *reason)
6663{
6664 if (*exceptionObject == NULL) {
6665 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006666 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006667 encoding, unicode, startpos, endpos, reason);
6668 }
6669 else {
6670 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6671 goto onError;
6672 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6673 goto onError;
6674 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6675 goto onError;
6676 return;
6677 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006678 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006679 }
6680}
6681
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006682/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006683static void
6684raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006685 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006686 PyObject *unicode,
6687 Py_ssize_t startpos, Py_ssize_t endpos,
6688 const char *reason)
6689{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006690 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006691 encoding, unicode, startpos, endpos, reason);
6692 if (*exceptionObject != NULL)
6693 PyCodec_StrictErrors(*exceptionObject);
6694}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006695
6696/* error handling callback helper:
6697 build arguments, call the callback and check the arguments,
6698 put the result into newpos and return the replacement string, which
6699 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006700static PyObject *
6701unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006702 PyObject **errorHandler,
6703 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006704 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006705 Py_ssize_t startpos, Py_ssize_t endpos,
6706 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006707{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006708 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006709 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006710 PyObject *restuple;
6711 PyObject *resunicode;
6712
6713 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006714 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006715 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006716 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006717 }
6718
Benjamin Petersonbac79492012-01-14 13:34:47 -05006719 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006720 return NULL;
6721 len = PyUnicode_GET_LENGTH(unicode);
6722
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006723 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006725 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006727
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006728 restuple = PyObject_CallFunctionObjArgs(
6729 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006730 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006731 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006732 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006733 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006734 Py_DECREF(restuple);
6735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006737 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 &resunicode, newpos)) {
6739 Py_DECREF(restuple);
6740 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006741 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006742 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6743 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6744 Py_DECREF(restuple);
6745 return NULL;
6746 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006747 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006748 *newpos = len + *newpos;
6749 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006750 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006751 Py_DECREF(restuple);
6752 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006753 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006754 Py_INCREF(resunicode);
6755 Py_DECREF(restuple);
6756 return resunicode;
6757}
6758
Alexander Belopolsky40018472011-02-26 01:02:56 +00006759static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006760unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006761 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006762 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006763{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764 /* input state */
6765 Py_ssize_t pos=0, size;
6766 int kind;
6767 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006768 /* pointer into the output */
6769 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006770 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6771 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006772 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006773 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006774 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006775 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006776 /* output object */
6777 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778
Benjamin Petersonbac79492012-01-14 13:34:47 -05006779 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006780 return NULL;
6781 size = PyUnicode_GET_LENGTH(unicode);
6782 kind = PyUnicode_KIND(unicode);
6783 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006784 /* allocate enough for a simple encoding without
6785 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006786 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006787 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006788
6789 _PyBytesWriter_Init(&writer);
6790 str = _PyBytesWriter_Alloc(&writer, size);
6791 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006792 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006793
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006794 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006795 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006796
Benjamin Peterson29060642009-01-31 22:14:21 +00006797 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006798 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006799 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006800 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006801 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006802 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006804 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006805 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006806 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006807 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006808 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006809
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006810 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006811 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006812
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006813 /* Only overallocate the buffer if it's not the last write */
6814 writer.overallocate = (collend < size);
6815
Benjamin Peterson29060642009-01-31 22:14:21 +00006816 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006817 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006818 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006819
6820 switch (error_handler) {
6821 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006822 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006823 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006824
6825 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006826 memset(str, '?', collend - collstart);
6827 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006828 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006829 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006830 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006831 break;
Victor Stinner50149202015-09-22 00:26:54 +02006832
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006833 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006834 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006835 writer.min_size -= (collend - collstart);
6836 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006837 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006838 if (str == NULL)
6839 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006840 pos = collend;
6841 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006842
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006843 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006844 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006845 writer.min_size -= (collend - collstart);
6846 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006847 unicode, collstart, collend);
6848 if (str == NULL)
6849 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006850 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006851 break;
Victor Stinner50149202015-09-22 00:26:54 +02006852
Victor Stinnerc3713e92015-09-29 12:32:13 +02006853 case _Py_ERROR_SURROGATEESCAPE:
6854 for (i = collstart; i < collend; ++i) {
6855 ch = PyUnicode_READ(kind, data, i);
6856 if (ch < 0xdc80 || 0xdcff < ch) {
6857 /* Not a UTF-8b surrogate */
6858 break;
6859 }
6860 *str++ = (char)(ch - 0xdc00);
6861 ++pos;
6862 }
6863 if (i >= collend)
6864 break;
6865 collstart = pos;
6866 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006867 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006868
Benjamin Peterson29060642009-01-31 22:14:21 +00006869 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006870 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6871 encoding, reason, unicode, &exc,
6872 collstart, collend, &newpos);
6873 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006874 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006875
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006876 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006877 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006878
Victor Stinner6bd525b2015-10-09 13:10:05 +02006879 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006880 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006881 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006882 PyBytes_AS_STRING(rep),
6883 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006884 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006885 else {
6886 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006887
Victor Stinner6bd525b2015-10-09 13:10:05 +02006888 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006889 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006890
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006891 if (limit == 256 ?
6892 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6893 !PyUnicode_IS_ASCII(rep))
6894 {
6895 /* Not all characters are smaller than limit */
6896 raise_encode_exception(&exc, encoding, unicode,
6897 collstart, collend, reason);
6898 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006899 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006900 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6901 str = _PyBytesWriter_WriteBytes(&writer, str,
6902 PyUnicode_DATA(rep),
6903 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006904 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006905 if (str == NULL)
6906 goto onError;
6907
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006908 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006909 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006910 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006911
6912 /* If overallocation was disabled, ensure that it was the last
6913 write. Otherwise, we missed an optimization */
6914 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006915 }
6916 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006917
Victor Stinner50149202015-09-22 00:26:54 +02006918 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006919 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006920 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006921
6922 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006923 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006924 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006925 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006926 Py_XDECREF(exc);
6927 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006928}
6929
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006930/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006931PyObject *
6932PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006933 Py_ssize_t size,
6934 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006935{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006936 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006937 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006938 if (unicode == NULL)
6939 return NULL;
6940 result = unicode_encode_ucs1(unicode, errors, 256);
6941 Py_DECREF(unicode);
6942 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006943}
6944
Alexander Belopolsky40018472011-02-26 01:02:56 +00006945PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006946_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947{
6948 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006949 PyErr_BadArgument();
6950 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006952 if (PyUnicode_READY(unicode) == -1)
6953 return NULL;
6954 /* Fast path: if it is a one-byte string, construct
6955 bytes object directly. */
6956 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6957 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6958 PyUnicode_GET_LENGTH(unicode));
6959 /* Non-Latin-1 characters present. Defer to above function to
6960 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006961 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006962}
6963
6964PyObject*
6965PyUnicode_AsLatin1String(PyObject *unicode)
6966{
6967 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006968}
6969
6970/* --- 7-bit ASCII Codec -------------------------------------------------- */
6971
Alexander Belopolsky40018472011-02-26 01:02:56 +00006972PyObject *
6973PyUnicode_DecodeASCII(const char *s,
6974 Py_ssize_t size,
6975 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006976{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006977 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006978 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006979 int kind;
6980 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006981 Py_ssize_t startinpos;
6982 Py_ssize_t endinpos;
6983 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006984 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006985 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006986 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006987 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006988
Guido van Rossumd57fd912000-03-10 22:53:23 +00006989 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006990 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006991
Guido van Rossumd57fd912000-03-10 22:53:23 +00006992 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006993 if (size == 1 && (unsigned char)s[0] < 128)
6994 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006995
Victor Stinner8f674cc2013-04-17 23:02:17 +02006996 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02006997 writer.min_length = size;
6998 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02006999 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007000
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007001 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007002 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007003 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007004 writer.pos = outpos;
7005 if (writer.pos == size)
7006 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007007
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 s += writer.pos;
7009 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007010 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007011 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007012 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007013 PyUnicode_WRITE(kind, data, writer.pos, c);
7014 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007015 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007016 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007017 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007018
7019 /* byte outsize range 0x00..0x7f: call the error handler */
7020
7021 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007022 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007023
7024 switch (error_handler)
7025 {
7026 case _Py_ERROR_REPLACE:
7027 case _Py_ERROR_SURROGATEESCAPE:
7028 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007029 but we may switch to UCS2 at the first write */
7030 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7031 goto onError;
7032 kind = writer.kind;
7033 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007034
7035 if (error_handler == _Py_ERROR_REPLACE)
7036 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7037 else
7038 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7039 writer.pos++;
7040 ++s;
7041 break;
7042
7043 case _Py_ERROR_IGNORE:
7044 ++s;
7045 break;
7046
7047 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007048 startinpos = s-starts;
7049 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007050 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007051 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 "ascii", "ordinal not in range(128)",
7053 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007054 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007055 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007056 kind = writer.kind;
7057 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007058 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007059 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007060 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007061 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007062 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007063
Benjamin Peterson29060642009-01-31 22:14:21 +00007064 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007065 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007066 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007067 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068 return NULL;
7069}
7070
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007071/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007072PyObject *
7073PyUnicode_EncodeASCII(const Py_UNICODE *p,
7074 Py_ssize_t size,
7075 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007076{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007077 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007078 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007079 if (unicode == NULL)
7080 return NULL;
7081 result = unicode_encode_ucs1(unicode, errors, 128);
7082 Py_DECREF(unicode);
7083 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007084}
7085
Alexander Belopolsky40018472011-02-26 01:02:56 +00007086PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007087_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088{
7089 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007090 PyErr_BadArgument();
7091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007093 if (PyUnicode_READY(unicode) == -1)
7094 return NULL;
7095 /* Fast path: if it is an ASCII-only string, construct bytes object
7096 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007097 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007098 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7099 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007100 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007101}
7102
7103PyObject *
7104PyUnicode_AsASCIIString(PyObject *unicode)
7105{
7106 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107}
7108
Steve Dowercc16be82016-09-08 10:35:16 -07007109#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007110
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007111/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007112
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007113#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007114#define NEED_RETRY
7115#endif
7116
Victor Stinner3a50e702011-10-18 21:21:00 +02007117#ifndef WC_ERR_INVALID_CHARS
7118# define WC_ERR_INVALID_CHARS 0x0080
7119#endif
7120
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007121static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007122code_page_name(UINT code_page, PyObject **obj)
7123{
7124 *obj = NULL;
7125 if (code_page == CP_ACP)
7126 return "mbcs";
7127 if (code_page == CP_UTF7)
7128 return "CP_UTF7";
7129 if (code_page == CP_UTF8)
7130 return "CP_UTF8";
7131
7132 *obj = PyBytes_FromFormat("cp%u", code_page);
7133 if (*obj == NULL)
7134 return NULL;
7135 return PyBytes_AS_STRING(*obj);
7136}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007137
Victor Stinner3a50e702011-10-18 21:21:00 +02007138static DWORD
7139decode_code_page_flags(UINT code_page)
7140{
7141 if (code_page == CP_UTF7) {
7142 /* The CP_UTF7 decoder only supports flags=0 */
7143 return 0;
7144 }
7145 else
7146 return MB_ERR_INVALID_CHARS;
7147}
7148
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007149/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007150 * Decode a byte string from a Windows code page into unicode object in strict
7151 * mode.
7152 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007153 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7154 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007155 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007156static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007157decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007158 wchar_t **buf,
7159 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 const char *in,
7161 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007162{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007163 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007164 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007165 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007166
7167 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007168 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007169 while ((outsize = MultiByteToWideChar(code_page, flags,
7170 in, insize, NULL, 0)) <= 0)
7171 {
7172 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7173 goto error;
7174 }
7175 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7176 flags = 0;
7177 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007178
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007179 /* Extend a wchar_t* buffer */
7180 Py_ssize_t n = *bufsize; /* Get the current length */
7181 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7182 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007183 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007184 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007185
7186 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007187 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7188 if (outsize <= 0)
7189 goto error;
7190 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007191
Victor Stinner3a50e702011-10-18 21:21:00 +02007192error:
7193 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7194 return -2;
7195 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007196 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007197}
7198
Victor Stinner3a50e702011-10-18 21:21:00 +02007199/*
7200 * Decode a byte string from a code page into unicode object with an error
7201 * handler.
7202 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007203 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007204 * UnicodeDecodeError exception and returns -1 on error.
7205 */
7206static int
7207decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007208 wchar_t **buf,
7209 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007210 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007211 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007212{
7213 const char *startin = in;
7214 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007215 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007216 /* Ideally, we should get reason from FormatMessage. This is the Windows
7217 2000 English version of the message. */
7218 const char *reason = "No mapping for the Unicode character exists "
7219 "in the target code page.";
7220 /* each step cannot decode more than 1 character, but a character can be
7221 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007222 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007223 int insize;
7224 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007225 PyObject *errorHandler = NULL;
7226 PyObject *exc = NULL;
7227 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007228 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007229 DWORD err;
7230 int ret = -1;
7231
7232 assert(size > 0);
7233
7234 encoding = code_page_name(code_page, &encoding_obj);
7235 if (encoding == NULL)
7236 return -1;
7237
Victor Stinner7d00cc12014-03-17 23:08:06 +01007238 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7240 UnicodeDecodeError. */
7241 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7242 if (exc != NULL) {
7243 PyCodec_StrictErrors(exc);
7244 Py_CLEAR(exc);
7245 }
7246 goto error;
7247 }
7248
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007249 /* Extend a wchar_t* buffer */
7250 Py_ssize_t n = *bufsize; /* Get the current length */
7251 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7252 PyErr_NoMemory();
7253 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007254 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007255 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7256 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007257 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007258 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007259
7260 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007261 while (in < endin)
7262 {
7263 /* Decode a character */
7264 insize = 1;
7265 do
7266 {
7267 outsize = MultiByteToWideChar(code_page, flags,
7268 in, insize,
7269 buffer, Py_ARRAY_LENGTH(buffer));
7270 if (outsize > 0)
7271 break;
7272 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007273 if (err == ERROR_INVALID_FLAGS && flags) {
7274 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7275 flags = 0;
7276 continue;
7277 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007278 if (err != ERROR_NO_UNICODE_TRANSLATION
7279 && err != ERROR_INSUFFICIENT_BUFFER)
7280 {
7281 PyErr_SetFromWindowsErr(0);
7282 goto error;
7283 }
7284 insize++;
7285 }
7286 /* 4=maximum length of a UTF-8 sequence */
7287 while (insize <= 4 && (in + insize) <= endin);
7288
7289 if (outsize <= 0) {
7290 Py_ssize_t startinpos, endinpos, outpos;
7291
Victor Stinner7d00cc12014-03-17 23:08:06 +01007292 /* last character in partial decode? */
7293 if (in + insize >= endin && !final)
7294 break;
7295
Victor Stinner3a50e702011-10-18 21:21:00 +02007296 startinpos = in - startin;
7297 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007298 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007299 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007300 errors, &errorHandler,
7301 encoding, reason,
7302 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007303 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007304 {
7305 goto error;
7306 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007307 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007308 }
7309 else {
7310 in += insize;
7311 memcpy(out, buffer, outsize * sizeof(wchar_t));
7312 out += outsize;
7313 }
7314 }
7315
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007316 /* Shrink the buffer */
7317 assert(out - *buf <= *bufsize);
7318 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007319 /* (in - startin) <= size and size is an int */
7320 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007321
7322error:
7323 Py_XDECREF(encoding_obj);
7324 Py_XDECREF(errorHandler);
7325 Py_XDECREF(exc);
7326 return ret;
7327}
7328
Victor Stinner3a50e702011-10-18 21:21:00 +02007329static PyObject *
7330decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007331 const char *s, Py_ssize_t size,
7332 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007333{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007334 wchar_t *buf = NULL;
7335 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007337
Victor Stinner3a50e702011-10-18 21:21:00 +02007338 if (code_page < 0) {
7339 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7340 return NULL;
7341 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007342 if (size < 0) {
7343 PyErr_BadInternalCall();
7344 return NULL;
7345 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007346
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007348 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007349
Victor Stinner76a31a62011-11-04 00:05:13 +01007350 do
7351 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007352#ifdef NEED_RETRY
Victor Stinner76a31a62011-11-04 00:05:13 +01007353 if (size > INT_MAX) {
7354 chunk_size = INT_MAX;
7355 final = 0;
7356 done = 0;
7357 }
7358 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 {
7361 chunk_size = (int)size;
7362 final = (consumed == NULL);
7363 done = 1;
7364 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007365
Victor Stinner76a31a62011-11-04 00:05:13 +01007366 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007367 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007368 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007369 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007370 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007371
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007372 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007373 s, chunk_size);
7374 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007377 errors, final);
7378 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007379
7380 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007382 return NULL;
7383 }
7384
7385 if (consumed)
7386 *consumed += converted;
7387
7388 s += converted;
7389 size -= converted;
7390 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007391
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007392 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7393 PyMem_Free(buf);
7394 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007395}
7396
Alexander Belopolsky40018472011-02-26 01:02:56 +00007397PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007398PyUnicode_DecodeCodePageStateful(int code_page,
7399 const char *s,
7400 Py_ssize_t size,
7401 const char *errors,
7402 Py_ssize_t *consumed)
7403{
7404 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7405}
7406
7407PyObject *
7408PyUnicode_DecodeMBCSStateful(const char *s,
7409 Py_ssize_t size,
7410 const char *errors,
7411 Py_ssize_t *consumed)
7412{
7413 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7414}
7415
7416PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007417PyUnicode_DecodeMBCS(const char *s,
7418 Py_ssize_t size,
7419 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007420{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007421 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7422}
7423
Victor Stinner3a50e702011-10-18 21:21:00 +02007424static DWORD
7425encode_code_page_flags(UINT code_page, const char *errors)
7426{
7427 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007428 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007429 }
7430 else if (code_page == CP_UTF7) {
7431 /* CP_UTF7 only supports flags=0 */
7432 return 0;
7433 }
7434 else {
7435 if (errors != NULL && strcmp(errors, "replace") == 0)
7436 return 0;
7437 else
7438 return WC_NO_BEST_FIT_CHARS;
7439 }
7440}
7441
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007442/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007443 * Encode a Unicode string to a Windows code page into a byte string in strict
7444 * mode.
7445 *
7446 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007447 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007448 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007449static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007450encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007451 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007452 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007453{
Victor Stinner554f3f02010-06-16 23:33:54 +00007454 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007455 BOOL *pusedDefaultChar = &usedDefaultChar;
7456 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007457 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007458 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007459 const DWORD flags = encode_code_page_flags(code_page, NULL);
7460 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007461 /* Create a substring so that we can get the UTF-16 representation
7462 of just the slice under consideration. */
7463 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007464
Martin v. Löwis3d325192011-11-04 18:23:06 +01007465 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007466
Victor Stinner3a50e702011-10-18 21:21:00 +02007467 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007468 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007470 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007471
Victor Stinner2fc507f2011-11-04 20:06:39 +01007472 substring = PyUnicode_Substring(unicode, offset, offset+len);
7473 if (substring == NULL)
7474 return -1;
7475 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7476 if (p == NULL) {
7477 Py_DECREF(substring);
7478 return -1;
7479 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007480 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007481
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007482 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007483 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007484 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007485 NULL, 0,
7486 NULL, pusedDefaultChar);
7487 if (outsize <= 0)
7488 goto error;
7489 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007490 if (pusedDefaultChar && *pusedDefaultChar) {
7491 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007492 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007493 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007494
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007496 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007497 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007498 if (*outbytes == NULL) {
7499 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007500 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007501 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007503 }
7504 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007505 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007506 const Py_ssize_t n = PyBytes_Size(*outbytes);
7507 if (outsize > PY_SSIZE_T_MAX - n) {
7508 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007509 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007511 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007512 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7513 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007514 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007515 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007517 }
7518
7519 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007520 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007521 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007522 out, outsize,
7523 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007524 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007525 if (outsize <= 0)
7526 goto error;
7527 if (pusedDefaultChar && *pusedDefaultChar)
7528 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007529 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007530
Victor Stinner3a50e702011-10-18 21:21:00 +02007531error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007532 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007533 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7534 return -2;
7535 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007536 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007537}
7538
Victor Stinner3a50e702011-10-18 21:21:00 +02007539/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007540 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007541 * error handler.
7542 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007543 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007544 * -1 on other error.
7545 */
7546static int
7547encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007548 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007549 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007550{
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007552 Py_ssize_t pos = unicode_offset;
7553 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 /* Ideally, we should get reason from FormatMessage. This is the Windows
7555 2000 English version of the message. */
7556 const char *reason = "invalid character";
7557 /* 4=maximum length of a UTF-8 sequence */
7558 char buffer[4];
7559 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7560 Py_ssize_t outsize;
7561 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007562 PyObject *errorHandler = NULL;
7563 PyObject *exc = NULL;
7564 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007565 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007566 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007567 PyObject *rep;
7568 int ret = -1;
7569
7570 assert(insize > 0);
7571
7572 encoding = code_page_name(code_page, &encoding_obj);
7573 if (encoding == NULL)
7574 return -1;
7575
7576 if (errors == NULL || strcmp(errors, "strict") == 0) {
7577 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7578 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007579 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007580 if (exc != NULL) {
7581 PyCodec_StrictErrors(exc);
7582 Py_DECREF(exc);
7583 }
7584 Py_XDECREF(encoding_obj);
7585 return -1;
7586 }
7587
7588 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7589 pusedDefaultChar = &usedDefaultChar;
7590 else
7591 pusedDefaultChar = NULL;
7592
7593 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7594 PyErr_NoMemory();
7595 goto error;
7596 }
7597 outsize = insize * Py_ARRAY_LENGTH(buffer);
7598
7599 if (*outbytes == NULL) {
7600 /* Create string object */
7601 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7602 if (*outbytes == NULL)
7603 goto error;
7604 out = PyBytes_AS_STRING(*outbytes);
7605 }
7606 else {
7607 /* Extend string object */
7608 Py_ssize_t n = PyBytes_Size(*outbytes);
7609 if (n > PY_SSIZE_T_MAX - outsize) {
7610 PyErr_NoMemory();
7611 goto error;
7612 }
7613 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7614 goto error;
7615 out = PyBytes_AS_STRING(*outbytes) + n;
7616 }
7617
7618 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007619 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007620 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007621 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7622 wchar_t chars[2];
7623 int charsize;
7624 if (ch < 0x10000) {
7625 chars[0] = (wchar_t)ch;
7626 charsize = 1;
7627 }
7628 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007629 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7630 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007631 charsize = 2;
7632 }
7633
Victor Stinner3a50e702011-10-18 21:21:00 +02007634 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007635 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007636 buffer, Py_ARRAY_LENGTH(buffer),
7637 NULL, pusedDefaultChar);
7638 if (outsize > 0) {
7639 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7640 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007641 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 memcpy(out, buffer, outsize);
7643 out += outsize;
7644 continue;
7645 }
7646 }
7647 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7648 PyErr_SetFromWindowsErr(0);
7649 goto error;
7650 }
7651
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 rep = unicode_encode_call_errorhandler(
7653 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007654 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007655 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007656 if (rep == NULL)
7657 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007658 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007659
7660 if (PyBytes_Check(rep)) {
7661 outsize = PyBytes_GET_SIZE(rep);
7662 if (outsize != 1) {
7663 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7664 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7665 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7666 Py_DECREF(rep);
7667 goto error;
7668 }
7669 out = PyBytes_AS_STRING(*outbytes) + offset;
7670 }
7671 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7672 out += outsize;
7673 }
7674 else {
7675 Py_ssize_t i;
7676 enum PyUnicode_Kind kind;
7677 void *data;
7678
Benjamin Petersonbac79492012-01-14 13:34:47 -05007679 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 Py_DECREF(rep);
7681 goto error;
7682 }
7683
7684 outsize = PyUnicode_GET_LENGTH(rep);
7685 if (outsize != 1) {
7686 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7687 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7688 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7689 Py_DECREF(rep);
7690 goto error;
7691 }
7692 out = PyBytes_AS_STRING(*outbytes) + offset;
7693 }
7694 kind = PyUnicode_KIND(rep);
7695 data = PyUnicode_DATA(rep);
7696 for (i=0; i < outsize; i++) {
7697 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7698 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007699 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007700 encoding, unicode,
7701 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007702 "unable to encode error handler result to ASCII");
7703 Py_DECREF(rep);
7704 goto error;
7705 }
7706 *out = (unsigned char)ch;
7707 out++;
7708 }
7709 }
7710 Py_DECREF(rep);
7711 }
7712 /* write a NUL byte */
7713 *out = 0;
7714 outsize = out - PyBytes_AS_STRING(*outbytes);
7715 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7716 if (_PyBytes_Resize(outbytes, outsize) < 0)
7717 goto error;
7718 ret = 0;
7719
7720error:
7721 Py_XDECREF(encoding_obj);
7722 Py_XDECREF(errorHandler);
7723 Py_XDECREF(exc);
7724 return ret;
7725}
7726
Victor Stinner3a50e702011-10-18 21:21:00 +02007727static PyObject *
7728encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007729 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007730 const char *errors)
7731{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007732 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007733 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007734 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007735 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007736
Victor Stinner29dacf22015-01-26 16:41:32 +01007737 if (!PyUnicode_Check(unicode)) {
7738 PyErr_BadArgument();
7739 return NULL;
7740 }
7741
Benjamin Petersonbac79492012-01-14 13:34:47 -05007742 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007743 return NULL;
7744 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007745
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 if (code_page < 0) {
7747 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7748 return NULL;
7749 }
7750
Martin v. Löwis3d325192011-11-04 18:23:06 +01007751 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007752 return PyBytes_FromStringAndSize(NULL, 0);
7753
Victor Stinner7581cef2011-11-03 22:32:33 +01007754 offset = 0;
7755 do
7756 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007757#ifdef NEED_RETRY
Victor Stinner2fc507f2011-11-04 20:06:39 +01007758 /* UTF-16 encoding may double the size, so use only INT_MAX/2
Martin v. Löwis3d325192011-11-04 18:23:06 +01007759 chunks. */
7760 if (len > INT_MAX/2) {
7761 chunk_len = INT_MAX/2;
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 done = 0;
7763 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007765#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007766 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007768 done = 1;
7769 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007770
Victor Stinner76a31a62011-11-04 00:05:13 +01007771 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007772 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007773 errors);
7774 if (ret == -2)
7775 ret = encode_code_page_errors(code_page, &outbytes,
7776 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007777 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007778 if (ret < 0) {
7779 Py_XDECREF(outbytes);
7780 return NULL;
7781 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007782
Victor Stinner7581cef2011-11-03 22:32:33 +01007783 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007785 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007786
Victor Stinner3a50e702011-10-18 21:21:00 +02007787 return outbytes;
7788}
7789
7790PyObject *
7791PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7792 Py_ssize_t size,
7793 const char *errors)
7794{
Victor Stinner7581cef2011-11-03 22:32:33 +01007795 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007796 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007797 if (unicode == NULL)
7798 return NULL;
7799 res = encode_code_page(CP_ACP, unicode, errors);
7800 Py_DECREF(unicode);
7801 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007802}
7803
7804PyObject *
7805PyUnicode_EncodeCodePage(int code_page,
7806 PyObject *unicode,
7807 const char *errors)
7808{
Victor Stinner7581cef2011-11-03 22:32:33 +01007809 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007810}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007811
Alexander Belopolsky40018472011-02-26 01:02:56 +00007812PyObject *
7813PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007814{
Victor Stinner7581cef2011-11-03 22:32:33 +01007815 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007816}
7817
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007818#undef NEED_RETRY
7819
Steve Dowercc16be82016-09-08 10:35:16 -07007820#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007821
Guido van Rossumd57fd912000-03-10 22:53:23 +00007822/* --- Character Mapping Codec -------------------------------------------- */
7823
Victor Stinnerfb161b12013-04-18 01:44:27 +02007824static int
7825charmap_decode_string(const char *s,
7826 Py_ssize_t size,
7827 PyObject *mapping,
7828 const char *errors,
7829 _PyUnicodeWriter *writer)
7830{
7831 const char *starts = s;
7832 const char *e;
7833 Py_ssize_t startinpos, endinpos;
7834 PyObject *errorHandler = NULL, *exc = NULL;
7835 Py_ssize_t maplen;
7836 enum PyUnicode_Kind mapkind;
7837 void *mapdata;
7838 Py_UCS4 x;
7839 unsigned char ch;
7840
7841 if (PyUnicode_READY(mapping) == -1)
7842 return -1;
7843
7844 maplen = PyUnicode_GET_LENGTH(mapping);
7845 mapdata = PyUnicode_DATA(mapping);
7846 mapkind = PyUnicode_KIND(mapping);
7847
7848 e = s + size;
7849
7850 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7851 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7852 * is disabled in encoding aliases, latin1 is preferred because
7853 * its implementation is faster. */
7854 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7855 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7856 Py_UCS4 maxchar = writer->maxchar;
7857
7858 assert (writer->kind == PyUnicode_1BYTE_KIND);
7859 while (s < e) {
7860 ch = *s;
7861 x = mapdata_ucs1[ch];
7862 if (x > maxchar) {
7863 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7864 goto onError;
7865 maxchar = writer->maxchar;
7866 outdata = (Py_UCS1 *)writer->data;
7867 }
7868 outdata[writer->pos] = x;
7869 writer->pos++;
7870 ++s;
7871 }
7872 return 0;
7873 }
7874
7875 while (s < e) {
7876 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7877 enum PyUnicode_Kind outkind = writer->kind;
7878 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7879 if (outkind == PyUnicode_1BYTE_KIND) {
7880 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7881 Py_UCS4 maxchar = writer->maxchar;
7882 while (s < e) {
7883 ch = *s;
7884 x = mapdata_ucs2[ch];
7885 if (x > maxchar)
7886 goto Error;
7887 outdata[writer->pos] = x;
7888 writer->pos++;
7889 ++s;
7890 }
7891 break;
7892 }
7893 else if (outkind == PyUnicode_2BYTE_KIND) {
7894 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7895 while (s < e) {
7896 ch = *s;
7897 x = mapdata_ucs2[ch];
7898 if (x == 0xFFFE)
7899 goto Error;
7900 outdata[writer->pos] = x;
7901 writer->pos++;
7902 ++s;
7903 }
7904 break;
7905 }
7906 }
7907 ch = *s;
7908
7909 if (ch < maplen)
7910 x = PyUnicode_READ(mapkind, mapdata, ch);
7911 else
7912 x = 0xfffe; /* invalid value */
7913Error:
7914 if (x == 0xfffe)
7915 {
7916 /* undefined mapping */
7917 startinpos = s-starts;
7918 endinpos = startinpos+1;
7919 if (unicode_decode_call_errorhandler_writer(
7920 errors, &errorHandler,
7921 "charmap", "character maps to <undefined>",
7922 &starts, &e, &startinpos, &endinpos, &exc, &s,
7923 writer)) {
7924 goto onError;
7925 }
7926 continue;
7927 }
7928
7929 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7930 goto onError;
7931 ++s;
7932 }
7933 Py_XDECREF(errorHandler);
7934 Py_XDECREF(exc);
7935 return 0;
7936
7937onError:
7938 Py_XDECREF(errorHandler);
7939 Py_XDECREF(exc);
7940 return -1;
7941}
7942
7943static int
7944charmap_decode_mapping(const char *s,
7945 Py_ssize_t size,
7946 PyObject *mapping,
7947 const char *errors,
7948 _PyUnicodeWriter *writer)
7949{
7950 const char *starts = s;
7951 const char *e;
7952 Py_ssize_t startinpos, endinpos;
7953 PyObject *errorHandler = NULL, *exc = NULL;
7954 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02007955 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02007956
7957 e = s + size;
7958
7959 while (s < e) {
7960 ch = *s;
7961
7962 /* Get mapping (char ordinal -> integer, Unicode char or None) */
7963 key = PyLong_FromLong((long)ch);
7964 if (key == NULL)
7965 goto onError;
7966
7967 item = PyObject_GetItem(mapping, key);
7968 Py_DECREF(key);
7969 if (item == NULL) {
7970 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7971 /* No mapping found means: mapping is undefined. */
7972 PyErr_Clear();
7973 goto Undefined;
7974 } else
7975 goto onError;
7976 }
7977
7978 /* Apply mapping */
7979 if (item == Py_None)
7980 goto Undefined;
7981 if (PyLong_Check(item)) {
7982 long value = PyLong_AS_LONG(item);
7983 if (value == 0xFFFE)
7984 goto Undefined;
7985 if (value < 0 || value > MAX_UNICODE) {
7986 PyErr_Format(PyExc_TypeError,
7987 "character mapping must be in range(0x%lx)",
7988 (unsigned long)MAX_UNICODE + 1);
7989 goto onError;
7990 }
7991
7992 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7993 goto onError;
7994 }
7995 else if (PyUnicode_Check(item)) {
7996 if (PyUnicode_READY(item) == -1)
7997 goto onError;
7998 if (PyUnicode_GET_LENGTH(item) == 1) {
7999 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8000 if (value == 0xFFFE)
8001 goto Undefined;
8002 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8003 goto onError;
8004 }
8005 else {
8006 writer->overallocate = 1;
8007 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8008 goto onError;
8009 }
8010 }
8011 else {
8012 /* wrong return value */
8013 PyErr_SetString(PyExc_TypeError,
8014 "character mapping must return integer, None or str");
8015 goto onError;
8016 }
8017 Py_CLEAR(item);
8018 ++s;
8019 continue;
8020
8021Undefined:
8022 /* undefined mapping */
8023 Py_CLEAR(item);
8024 startinpos = s-starts;
8025 endinpos = startinpos+1;
8026 if (unicode_decode_call_errorhandler_writer(
8027 errors, &errorHandler,
8028 "charmap", "character maps to <undefined>",
8029 &starts, &e, &startinpos, &endinpos, &exc, &s,
8030 writer)) {
8031 goto onError;
8032 }
8033 }
8034 Py_XDECREF(errorHandler);
8035 Py_XDECREF(exc);
8036 return 0;
8037
8038onError:
8039 Py_XDECREF(item);
8040 Py_XDECREF(errorHandler);
8041 Py_XDECREF(exc);
8042 return -1;
8043}
8044
Alexander Belopolsky40018472011-02-26 01:02:56 +00008045PyObject *
8046PyUnicode_DecodeCharmap(const char *s,
8047 Py_ssize_t size,
8048 PyObject *mapping,
8049 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008050{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008051 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008052
Guido van Rossumd57fd912000-03-10 22:53:23 +00008053 /* Default to Latin-1 */
8054 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008055 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008056
Guido van Rossumd57fd912000-03-10 22:53:23 +00008057 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008058 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008059 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008060 writer.min_length = size;
8061 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008062 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008063
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008064 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008065 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8066 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008067 }
8068 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008069 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8070 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008071 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008072 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008073
Benjamin Peterson29060642009-01-31 22:14:21 +00008074 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008075 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008076 return NULL;
8077}
8078
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008079/* Charmap encoding: the lookup table */
8080
Alexander Belopolsky40018472011-02-26 01:02:56 +00008081struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 PyObject_HEAD
8083 unsigned char level1[32];
8084 int count2, count3;
8085 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008086};
8087
8088static PyObject*
8089encoding_map_size(PyObject *obj, PyObject* args)
8090{
8091 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008092 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008093 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008094}
8095
8096static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008097 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008098 PyDoc_STR("Return the size (in bytes) of this object") },
8099 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008100};
8101
8102static void
8103encoding_map_dealloc(PyObject* o)
8104{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008105 PyObject_FREE(o);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008106}
8107
8108static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008109 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008110 "EncodingMap", /*tp_name*/
8111 sizeof(struct encoding_map), /*tp_basicsize*/
8112 0, /*tp_itemsize*/
8113 /* methods */
8114 encoding_map_dealloc, /*tp_dealloc*/
8115 0, /*tp_print*/
8116 0, /*tp_getattr*/
8117 0, /*tp_setattr*/
Mark Dickinsone94c6792009-02-02 20:36:42 +00008118 0, /*tp_reserved*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008119 0, /*tp_repr*/
8120 0, /*tp_as_number*/
8121 0, /*tp_as_sequence*/
8122 0, /*tp_as_mapping*/
8123 0, /*tp_hash*/
8124 0, /*tp_call*/
8125 0, /*tp_str*/
8126 0, /*tp_getattro*/
8127 0, /*tp_setattro*/
8128 0, /*tp_as_buffer*/
8129 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8130 0, /*tp_doc*/
8131 0, /*tp_traverse*/
8132 0, /*tp_clear*/
8133 0, /*tp_richcompare*/
8134 0, /*tp_weaklistoffset*/
8135 0, /*tp_iter*/
8136 0, /*tp_iternext*/
8137 encoding_map_methods, /*tp_methods*/
8138 0, /*tp_members*/
8139 0, /*tp_getset*/
8140 0, /*tp_base*/
8141 0, /*tp_dict*/
8142 0, /*tp_descr_get*/
8143 0, /*tp_descr_set*/
8144 0, /*tp_dictoffset*/
8145 0, /*tp_init*/
8146 0, /*tp_alloc*/
8147 0, /*tp_new*/
8148 0, /*tp_free*/
8149 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008150};
8151
8152PyObject*
8153PyUnicode_BuildEncodingMap(PyObject* string)
8154{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008155 PyObject *result;
8156 struct encoding_map *mresult;
8157 int i;
8158 int need_dict = 0;
8159 unsigned char level1[32];
8160 unsigned char level2[512];
8161 unsigned char *mlevel1, *mlevel2, *mlevel3;
8162 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008163 int kind;
8164 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008165 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008166 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008167
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008168 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169 PyErr_BadArgument();
8170 return NULL;
8171 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008172 kind = PyUnicode_KIND(string);
8173 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008174 length = PyUnicode_GET_LENGTH(string);
8175 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008176 memset(level1, 0xFF, sizeof level1);
8177 memset(level2, 0xFF, sizeof level2);
8178
8179 /* If there isn't a one-to-one mapping of NULL to \0,
8180 or if there are non-BMP characters, we need to use
8181 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008182 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008183 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008184 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008186 ch = PyUnicode_READ(kind, data, i);
8187 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008188 need_dict = 1;
8189 break;
8190 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008191 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008192 /* unmapped character */
8193 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008194 l1 = ch >> 11;
8195 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008196 if (level1[l1] == 0xFF)
8197 level1[l1] = count2++;
8198 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008199 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008200 }
8201
8202 if (count2 >= 0xFF || count3 >= 0xFF)
8203 need_dict = 1;
8204
8205 if (need_dict) {
8206 PyObject *result = PyDict_New();
8207 PyObject *key, *value;
8208 if (!result)
8209 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008210 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008211 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008212 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008213 if (!key || !value)
8214 goto failed1;
8215 if (PyDict_SetItem(result, key, value) == -1)
8216 goto failed1;
8217 Py_DECREF(key);
8218 Py_DECREF(value);
8219 }
8220 return result;
8221 failed1:
8222 Py_XDECREF(key);
8223 Py_XDECREF(value);
8224 Py_DECREF(result);
8225 return NULL;
8226 }
8227
8228 /* Create a three-level trie */
8229 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8230 16*count2 + 128*count3 - 1);
8231 if (!result)
8232 return PyErr_NoMemory();
8233 PyObject_Init(result, &EncodingMapType);
8234 mresult = (struct encoding_map*)result;
8235 mresult->count2 = count2;
8236 mresult->count3 = count3;
8237 mlevel1 = mresult->level1;
8238 mlevel2 = mresult->level23;
8239 mlevel3 = mresult->level23 + 16*count2;
8240 memcpy(mlevel1, level1, 32);
8241 memset(mlevel2, 0xFF, 16*count2);
8242 memset(mlevel3, 0, 128*count3);
8243 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008244 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008245 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008246 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8247 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008248 /* unmapped character */
8249 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008250 o1 = ch>>11;
8251 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008252 i2 = 16*mlevel1[o1] + o2;
8253 if (mlevel2[i2] == 0xFF)
8254 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008255 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008256 i3 = 128*mlevel2[i2] + o3;
8257 mlevel3[i3] = i;
8258 }
8259 return result;
8260}
8261
8262static int
Victor Stinner22168992011-11-20 17:09:18 +01008263encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008264{
8265 struct encoding_map *map = (struct encoding_map*)mapping;
8266 int l1 = c>>11;
8267 int l2 = (c>>7) & 0xF;
8268 int l3 = c & 0x7F;
8269 int i;
8270
Victor Stinner22168992011-11-20 17:09:18 +01008271 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008272 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 if (c == 0)
8274 return 0;
8275 /* level 1*/
8276 i = map->level1[l1];
8277 if (i == 0xFF) {
8278 return -1;
8279 }
8280 /* level 2*/
8281 i = map->level23[16*i+l2];
8282 if (i == 0xFF) {
8283 return -1;
8284 }
8285 /* level 3 */
8286 i = map->level23[16*map->count2 + 128*i + l3];
8287 if (i == 0) {
8288 return -1;
8289 }
8290 return i;
8291}
8292
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008293/* Lookup the character ch in the mapping. If the character
8294 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008295 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008296static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008297charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008298{
Christian Heimes217cfd12007-12-02 14:31:20 +00008299 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008300 PyObject *x;
8301
8302 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008303 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008304 x = PyObject_GetItem(mapping, w);
8305 Py_DECREF(w);
8306 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008307 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8308 /* No mapping found means: mapping is undefined. */
8309 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008310 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008311 } else
8312 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008313 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008314 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008315 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008316 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 long value = PyLong_AS_LONG(x);
8318 if (value < 0 || value > 255) {
8319 PyErr_SetString(PyExc_TypeError,
8320 "character mapping must be in range(256)");
8321 Py_DECREF(x);
8322 return NULL;
8323 }
8324 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008325 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008326 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008327 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008328 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 /* wrong return value */
8330 PyErr_Format(PyExc_TypeError,
8331 "character mapping must return integer, bytes or None, not %.400s",
8332 x->ob_type->tp_name);
8333 Py_DECREF(x);
8334 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008335 }
8336}
8337
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008338static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008339charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008341 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8342 /* exponentially overallocate to minimize reallocations */
8343 if (requiredsize < 2*outsize)
8344 requiredsize = 2*outsize;
8345 if (_PyBytes_Resize(outobj, requiredsize))
8346 return -1;
8347 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008348}
8349
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the character has no mapping */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008353/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008354 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355 space is available. Return a new reference to the object that
8356 was put in the output buffer, or Py_None, if the mapping was undefined
8357 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008358 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008359static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008360charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008362{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 PyObject *rep;
8364 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008365 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008366
Christian Heimes90aa7642007-12-19 02:45:37 +00008367 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008368 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008369 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 if (res == -1)
8371 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008372 if (outsize<requiredsize)
8373 if (charmapencode_resize(outobj, outpos, requiredsize))
8374 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008375 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008376 outstart[(*outpos)++] = (char)res;
8377 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008378 }
8379
8380 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008381 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008382 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008383 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 Py_DECREF(rep);
8385 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008386 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008387 if (PyLong_Check(rep)) {
8388 Py_ssize_t requiredsize = *outpos+1;
8389 if (outsize<requiredsize)
8390 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8391 Py_DECREF(rep);
8392 return enc_EXCEPTION;
8393 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008394 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008395 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008396 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 else {
8398 const char *repchars = PyBytes_AS_STRING(rep);
8399 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8400 Py_ssize_t requiredsize = *outpos+repsize;
8401 if (outsize<requiredsize)
8402 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8403 Py_DECREF(rep);
8404 return enc_EXCEPTION;
8405 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008406 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008407 memcpy(outstart + *outpos, repchars, repsize);
8408 *outpos += repsize;
8409 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008410 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008411 Py_DECREF(rep);
8412 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008413}
8414
8415/* handle an error in PyUnicode_EncodeCharmap
8416 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008417static int
8418charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008419 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008420 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008421 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008422 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008423{
8424 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008425 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008426 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008427 enum PyUnicode_Kind kind;
8428 void *data;
8429 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008430 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008431 Py_ssize_t collstartpos = *inpos;
8432 Py_ssize_t collendpos = *inpos+1;
8433 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008434 const char *encoding = "charmap";
8435 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008436 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008437 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008438 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008439
Benjamin Petersonbac79492012-01-14 13:34:47 -05008440 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008441 return -1;
8442 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008443 /* find all unencodable characters */
8444 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008445 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008446 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008447 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008448 val = encoding_map_lookup(ch, mapping);
8449 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008450 break;
8451 ++collendpos;
8452 continue;
8453 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008454
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008455 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8456 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008457 if (rep==NULL)
8458 return -1;
8459 else if (rep!=Py_None) {
8460 Py_DECREF(rep);
8461 break;
8462 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008463 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008464 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008465 }
8466 /* cache callback name lookup
8467 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008468 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008469 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008470
8471 switch (*error_handler) {
8472 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008473 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008474 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008475
8476 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008477 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008478 x = charmapencode_output('?', mapping, res, respos);
8479 if (x==enc_EXCEPTION) {
8480 return -1;
8481 }
8482 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008483 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008484 return -1;
8485 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008486 }
8487 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008488 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008489 *inpos = collendpos;
8490 break;
Victor Stinner50149202015-09-22 00:26:54 +02008491
8492 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008493 /* generate replacement (temporarily (mis)uses p) */
8494 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008495 char buffer[2+29+1+1];
8496 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008497 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008498 for (cp = buffer; *cp; ++cp) {
8499 x = charmapencode_output(*cp, mapping, res, respos);
8500 if (x==enc_EXCEPTION)
8501 return -1;
8502 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008503 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008504 return -1;
8505 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008506 }
8507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 *inpos = collendpos;
8509 break;
Victor Stinner50149202015-09-22 00:26:54 +02008510
Benjamin Peterson14339b62009-01-31 16:36:08 +00008511 default:
Victor Stinner50149202015-09-22 00:26:54 +02008512 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008513 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008514 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008515 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008517 if (PyBytes_Check(repunicode)) {
8518 /* Directly copy bytes result to output. */
8519 Py_ssize_t outsize = PyBytes_Size(*res);
8520 Py_ssize_t requiredsize;
8521 repsize = PyBytes_Size(repunicode);
8522 requiredsize = *respos + repsize;
8523 if (requiredsize > outsize)
8524 /* Make room for all additional bytes. */
8525 if (charmapencode_resize(res, respos, requiredsize)) {
8526 Py_DECREF(repunicode);
8527 return -1;
8528 }
8529 memcpy(PyBytes_AsString(*res) + *respos,
8530 PyBytes_AsString(repunicode), repsize);
8531 *respos += repsize;
8532 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008533 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008534 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008535 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008536 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008537 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008538 Py_DECREF(repunicode);
8539 return -1;
8540 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008541 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008542 data = PyUnicode_DATA(repunicode);
8543 kind = PyUnicode_KIND(repunicode);
8544 for (index = 0; index < repsize; index++) {
8545 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8546 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008547 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008548 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 return -1;
8550 }
8551 else if (x==enc_FAILED) {
8552 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008553 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008554 return -1;
8555 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008556 }
8557 *inpos = newpos;
8558 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008559 }
8560 return 0;
8561}
8562
Alexander Belopolsky40018472011-02-26 01:02:56 +00008563PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008564_PyUnicode_EncodeCharmap(PyObject *unicode,
8565 PyObject *mapping,
8566 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008567{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008568 /* output object */
8569 PyObject *res = NULL;
8570 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008571 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008572 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008573 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008574 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008575 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008576 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008577 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008578 void *data;
8579 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008580
Benjamin Petersonbac79492012-01-14 13:34:47 -05008581 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008582 return NULL;
8583 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008584 data = PyUnicode_DATA(unicode);
8585 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008586
Guido van Rossumd57fd912000-03-10 22:53:23 +00008587 /* Default to Latin-1 */
8588 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008589 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008590
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008591 /* allocate enough for a simple encoding without
8592 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008593 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008594 if (res == NULL)
8595 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008596 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008597 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008598
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008599 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008600 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008601 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008602 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 if (x==enc_EXCEPTION) /* error */
8604 goto onError;
8605 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008606 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008607 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008608 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 &res, &respos)) {
8610 goto onError;
8611 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008612 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 else
8614 /* done with this character => adjust input position */
8615 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008616 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008617
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008618 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008619 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008620 if (_PyBytes_Resize(&res, respos) < 0)
8621 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008622
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008623 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008624 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 return res;
8626
Benjamin Peterson29060642009-01-31 22:14:21 +00008627 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008628 Py_XDECREF(res);
8629 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008630 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008631 return NULL;
8632}
8633
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008634/* Deprecated */
8635PyObject *
8636PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8637 Py_ssize_t size,
8638 PyObject *mapping,
8639 const char *errors)
8640{
8641 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008642 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008643 if (unicode == NULL)
8644 return NULL;
8645 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8646 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008647 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008648}
8649
Alexander Belopolsky40018472011-02-26 01:02:56 +00008650PyObject *
8651PyUnicode_AsCharmapString(PyObject *unicode,
8652 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008653{
8654 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008655 PyErr_BadArgument();
8656 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008657 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008658 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659}
8660
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008661/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008662static void
8663make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008664 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008665 Py_ssize_t startpos, Py_ssize_t endpos,
8666 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008667{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008668 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008669 *exceptionObject = _PyUnicodeTranslateError_Create(
8670 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008671 }
8672 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008673 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8674 goto onError;
8675 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8676 goto onError;
8677 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8678 goto onError;
8679 return;
8680 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008681 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008682 }
8683}
8684
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008685/* error handling callback helper:
8686 build arguments, call the callback and check the arguments,
8687 put the result into newpos and return the replacement string, which
8688 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008689static PyObject *
8690unicode_translate_call_errorhandler(const char *errors,
8691 PyObject **errorHandler,
8692 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008693 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008694 Py_ssize_t startpos, Py_ssize_t endpos,
8695 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008696{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008697 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008699 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700 PyObject *restuple;
8701 PyObject *resunicode;
8702
8703 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008704 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008705 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 }
8708
8709 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008710 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008711 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008712 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008714 restuple = PyObject_CallFunctionObjArgs(
8715 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008716 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008717 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008719 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008720 Py_DECREF(restuple);
8721 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008722 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008723 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008724 &resunicode, &i_newpos)) {
8725 Py_DECREF(restuple);
8726 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008727 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008728 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008729 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 else
8731 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008732 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008733 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008734 Py_DECREF(restuple);
8735 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008736 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008737 Py_INCREF(resunicode);
8738 Py_DECREF(restuple);
8739 return resunicode;
8740}
8741
8742/* Lookup the character ch in the mapping and put the result in result,
8743 which must be decrefed by the caller.
8744 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008745static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008746charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008747{
Christian Heimes217cfd12007-12-02 14:31:20 +00008748 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749 PyObject *x;
8750
8751 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008752 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008753 x = PyObject_GetItem(mapping, w);
8754 Py_DECREF(w);
8755 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008756 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8757 /* No mapping found means: use 1:1 mapping. */
8758 PyErr_Clear();
8759 *result = NULL;
8760 return 0;
8761 } else
8762 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008763 }
8764 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008765 *result = x;
8766 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008767 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008768 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008769 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008770 if (value < 0 || value > MAX_UNICODE) {
8771 PyErr_Format(PyExc_ValueError,
8772 "character mapping must be in range(0x%x)",
8773 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008774 Py_DECREF(x);
8775 return -1;
8776 }
8777 *result = x;
8778 return 0;
8779 }
8780 else if (PyUnicode_Check(x)) {
8781 *result = x;
8782 return 0;
8783 }
8784 else {
8785 /* wrong return value */
8786 PyErr_SetString(PyExc_TypeError,
8787 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008788 Py_DECREF(x);
8789 return -1;
8790 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008791}
Victor Stinner1194ea02014-04-04 19:37:40 +02008792
8793/* lookup the character, write the result into the writer.
8794 Return 1 if the result was written into the writer, return 0 if the mapping
8795 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008796static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008797charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8798 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008799{
Victor Stinner1194ea02014-04-04 19:37:40 +02008800 PyObject *item;
8801
8802 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008803 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008804
8805 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008806 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008807 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008808 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008809 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008810 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008811 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008812
8813 if (item == Py_None) {
8814 Py_DECREF(item);
8815 return 0;
8816 }
8817
8818 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008819 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8820 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8821 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008822 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8823 Py_DECREF(item);
8824 return -1;
8825 }
8826 Py_DECREF(item);
8827 return 1;
8828 }
8829
8830 if (!PyUnicode_Check(item)) {
8831 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008832 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008833 }
8834
8835 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8836 Py_DECREF(item);
8837 return -1;
8838 }
8839
8840 Py_DECREF(item);
8841 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008842}
8843
Victor Stinner89a76ab2014-04-05 11:44:04 +02008844static int
8845unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8846 Py_UCS1 *translate)
8847{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008848 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008849 int ret = 0;
8850
Victor Stinner89a76ab2014-04-05 11:44:04 +02008851 if (charmaptranslate_lookup(ch, mapping, &item)) {
8852 return -1;
8853 }
8854
8855 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008856 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008857 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008858 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008859 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 /* not found => default to 1:1 mapping */
8861 translate[ch] = ch;
8862 return 1;
8863 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008864 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008865 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008866 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8867 used it */
8868 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008869 /* invalid character or character outside ASCII:
8870 skip the fast translate */
8871 goto exit;
8872 }
8873 translate[ch] = (Py_UCS1)replace;
8874 }
8875 else if (PyUnicode_Check(item)) {
8876 Py_UCS4 replace;
8877
8878 if (PyUnicode_READY(item) == -1) {
8879 Py_DECREF(item);
8880 return -1;
8881 }
8882 if (PyUnicode_GET_LENGTH(item) != 1)
8883 goto exit;
8884
8885 replace = PyUnicode_READ_CHAR(item, 0);
8886 if (replace > 127)
8887 goto exit;
8888 translate[ch] = (Py_UCS1)replace;
8889 }
8890 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008891 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008892 goto exit;
8893 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 ret = 1;
8895
Benjamin Peterson1365de72014-04-07 20:15:41 -04008896 exit:
8897 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008898 return ret;
8899}
8900
8901/* Fast path for ascii => ascii translation. Return 1 if the whole string
8902 was translated into writer, return 0 if the input string was partially
8903 translated into writer, raise an exception and return -1 on error. */
8904static int
8905unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008906 _PyUnicodeWriter *writer, int ignore,
8907 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008908{
Victor Stinner872b2912014-04-05 14:27:07 +02008909 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910 Py_ssize_t len;
8911 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008912 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008913
Victor Stinner89a76ab2014-04-05 11:44:04 +02008914 len = PyUnicode_GET_LENGTH(input);
8915
Victor Stinner872b2912014-04-05 14:27:07 +02008916 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008917
8918 in = PyUnicode_1BYTE_DATA(input);
8919 end = in + len;
8920
8921 assert(PyUnicode_IS_ASCII(writer->buffer));
8922 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8923 out = PyUnicode_1BYTE_DATA(writer->buffer);
8924
Victor Stinner872b2912014-04-05 14:27:07 +02008925 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008926 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008927 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008928 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008929 int translate = unicode_fast_translate_lookup(mapping, ch,
8930 ascii_table);
8931 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008932 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008933 if (translate == 0)
8934 goto exit;
8935 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008936 }
Victor Stinner872b2912014-04-05 14:27:07 +02008937 if (ch2 == 0xfe) {
8938 if (ignore)
8939 continue;
8940 goto exit;
8941 }
8942 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008943 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008944 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008945 }
Victor Stinner872b2912014-04-05 14:27:07 +02008946 res = 1;
8947
8948exit:
8949 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008950 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008951 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008952}
8953
Victor Stinner3222da22015-10-01 22:07:32 +02008954static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008955_PyUnicode_TranslateCharmap(PyObject *input,
8956 PyObject *mapping,
8957 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008958{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008959 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008960 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 Py_ssize_t size, i;
8962 int kind;
8963 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008964 _PyUnicodeWriter writer;
8965 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008966 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008967 PyObject *errorHandler = NULL;
8968 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008969 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008970 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008971
Guido van Rossumd57fd912000-03-10 22:53:23 +00008972 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008973 PyErr_BadArgument();
8974 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008975 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008976
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008977 if (PyUnicode_READY(input) == -1)
8978 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008979 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008980 kind = PyUnicode_KIND(input);
8981 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008983 if (size == 0)
8984 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008985
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008986 /* allocate enough for a simple 1:1 translation without
8987 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008988 _PyUnicodeWriter_Init(&writer);
8989 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008990 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008991
Victor Stinner872b2912014-04-05 14:27:07 +02008992 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8993
Victor Stinner33798672016-03-01 21:59:58 +01008994 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008995 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008996 if (PyUnicode_IS_ASCII(input)) {
8997 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
8998 if (res < 0) {
8999 _PyUnicodeWriter_Dealloc(&writer);
9000 return NULL;
9001 }
9002 if (res == 1)
9003 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009004 }
Victor Stinner33798672016-03-01 21:59:58 +01009005 else {
9006 i = 0;
9007 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009008
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009009 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009010 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009011 int translate;
9012 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9013 Py_ssize_t newpos;
9014 /* startpos for collecting untranslatable chars */
9015 Py_ssize_t collstart;
9016 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009017 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009018
Victor Stinner1194ea02014-04-04 19:37:40 +02009019 ch = PyUnicode_READ(kind, data, i);
9020 translate = charmaptranslate_output(ch, mapping, &writer);
9021 if (translate < 0)
9022 goto onError;
9023
9024 if (translate != 0) {
9025 /* it worked => adjust input pointer */
9026 ++i;
9027 continue;
9028 }
9029
9030 /* untranslatable character */
9031 collstart = i;
9032 collend = i+1;
9033
9034 /* find all untranslatable characters */
9035 while (collend < size) {
9036 PyObject *x;
9037 ch = PyUnicode_READ(kind, data, collend);
9038 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009039 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009040 Py_XDECREF(x);
9041 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009042 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009043 ++collend;
9044 }
9045
9046 if (ignore) {
9047 i = collend;
9048 }
9049 else {
9050 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9051 reason, input, &exc,
9052 collstart, collend, &newpos);
9053 if (repunicode == NULL)
9054 goto onError;
9055 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009056 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009057 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009058 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 Py_DECREF(repunicode);
9060 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009061 }
9062 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009063 Py_XDECREF(exc);
9064 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009065 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009066
Benjamin Peterson29060642009-01-31 22:14:21 +00009067 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009068 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009069 Py_XDECREF(exc);
9070 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009071 return NULL;
9072}
9073
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009074/* Deprecated. Use PyUnicode_Translate instead. */
9075PyObject *
9076PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9077 Py_ssize_t size,
9078 PyObject *mapping,
9079 const char *errors)
9080{
Christian Heimes5f520f42012-09-11 14:03:25 +02009081 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009082 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009083 if (!unicode)
9084 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009085 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9086 Py_DECREF(unicode);
9087 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009088}
9089
Alexander Belopolsky40018472011-02-26 01:02:56 +00009090PyObject *
9091PyUnicode_Translate(PyObject *str,
9092 PyObject *mapping,
9093 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009094{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009095 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009096 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009097 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009098}
Tim Petersced69f82003-09-16 20:30:58 +00009099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009100PyObject *
9101_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9102{
9103 if (!PyUnicode_Check(unicode)) {
9104 PyErr_BadInternalCall();
9105 return NULL;
9106 }
9107 if (PyUnicode_READY(unicode) == -1)
9108 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009109 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009110 /* If the string is already ASCII, just return the same string */
9111 Py_INCREF(unicode);
9112 return unicode;
9113 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009114
9115 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9116 PyObject *result = PyUnicode_New(len, 127);
9117 if (result == NULL) {
9118 return NULL;
9119 }
9120
9121 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9122 int kind = PyUnicode_KIND(unicode);
9123 const void *data = PyUnicode_DATA(unicode);
9124 Py_ssize_t i;
9125 for (i = 0; i < len; ++i) {
9126 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9127 if (ch < 127) {
9128 out[i] = ch;
9129 }
9130 else if (Py_UNICODE_ISSPACE(ch)) {
9131 out[i] = ' ';
9132 }
9133 else {
9134 int decimal = Py_UNICODE_TODECIMAL(ch);
9135 if (decimal < 0) {
9136 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009137 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009138 _PyUnicode_LENGTH(result) = i + 1;
9139 break;
9140 }
9141 out[i] = '0' + decimal;
9142 }
9143 }
9144
INADA Naoki16dfca42018-07-14 12:06:43 +09009145 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009146 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009147}
9148
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009149PyObject *
9150PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9151 Py_ssize_t length)
9152{
Victor Stinnerf0124502011-11-21 23:12:56 +01009153 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009154 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009155 Py_UCS4 maxchar;
9156 enum PyUnicode_Kind kind;
9157 void *data;
9158
Victor Stinner99d7ad02012-02-22 13:37:39 +01009159 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009160 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009161 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009162 if (ch > 127) {
9163 int decimal = Py_UNICODE_TODECIMAL(ch);
9164 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009165 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009166 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009167 }
9168 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009169
9170 /* Copy to a new string */
9171 decimal = PyUnicode_New(length, maxchar);
9172 if (decimal == NULL)
9173 return decimal;
9174 kind = PyUnicode_KIND(decimal);
9175 data = PyUnicode_DATA(decimal);
9176 /* Iterate over code points */
9177 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009178 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009179 if (ch > 127) {
9180 int decimal = Py_UNICODE_TODECIMAL(ch);
9181 if (decimal >= 0)
9182 ch = '0' + decimal;
9183 }
9184 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009185 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009186 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009187}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009188/* --- Decimal Encoder ---------------------------------------------------- */
9189
Alexander Belopolsky40018472011-02-26 01:02:56 +00009190int
9191PyUnicode_EncodeDecimal(Py_UNICODE *s,
9192 Py_ssize_t length,
9193 char *output,
9194 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009195{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009196 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009197 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009198 enum PyUnicode_Kind kind;
9199 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009200
9201 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009202 PyErr_BadArgument();
9203 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009204 }
9205
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009206 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009207 if (unicode == NULL)
9208 return -1;
9209
Victor Stinner42bf7752011-11-21 22:52:58 +01009210 kind = PyUnicode_KIND(unicode);
9211 data = PyUnicode_DATA(unicode);
9212
Victor Stinnerb84d7232011-11-22 01:50:07 +01009213 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009214 PyObject *exc;
9215 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009216 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009217 Py_ssize_t startpos;
9218
9219 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009220
Benjamin Peterson29060642009-01-31 22:14:21 +00009221 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009222 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009223 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009224 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009225 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 decimal = Py_UNICODE_TODECIMAL(ch);
9227 if (decimal >= 0) {
9228 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009229 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009230 continue;
9231 }
9232 if (0 < ch && ch < 256) {
9233 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009234 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009235 continue;
9236 }
Victor Stinner6345be92011-11-25 20:09:01 +01009237
Victor Stinner42bf7752011-11-21 22:52:58 +01009238 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009239 exc = NULL;
9240 raise_encode_exception(&exc, "decimal", unicode,
9241 startpos, startpos+1,
9242 "invalid decimal Unicode string");
9243 Py_XDECREF(exc);
9244 Py_DECREF(unicode);
9245 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009246 }
9247 /* 0-terminate the output string */
9248 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009249 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009250 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009251}
9252
Guido van Rossumd57fd912000-03-10 22:53:23 +00009253/* --- Helpers ------------------------------------------------------------ */
9254
/* Helper macro to fix up start/end slice values against a length:
   end is clipped to len, negative start/end are taken relative to len and
   then clamped at 0.  start and end must be modifiable lvalues; start, end
   and len are each evaluated more than once.
   The body is wrapped in do { } while (0) so the macro behaves as a single
   statement (use it with a trailing semicolon), and every argument is
   parenthesized so that any expression can be passed for len. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9269
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009270static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009271any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009273 Py_ssize_t end,
9274 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009275{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009276 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277 void *buf1, *buf2;
9278 Py_ssize_t len1, len2, result;
9279
9280 kind1 = PyUnicode_KIND(s1);
9281 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009282 if (kind1 < kind2)
9283 return -1;
9284
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009285 len1 = PyUnicode_GET_LENGTH(s1);
9286 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009287 ADJUST_INDICES(start, end, len1);
9288 if (end - start < len2)
9289 return -1;
9290
9291 buf1 = PyUnicode_DATA(s1);
9292 buf2 = PyUnicode_DATA(s2);
9293 if (len2 == 1) {
9294 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9295 result = findchar((const char *)buf1 + kind1*start,
9296 kind1, end - start, ch, direction);
9297 if (result == -1)
9298 return -1;
9299 else
9300 return start + result;
9301 }
9302
9303 if (kind2 != kind1) {
9304 buf2 = _PyUnicode_AsKind(s2, kind1);
9305 if (!buf2)
9306 return -2;
9307 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009308
Victor Stinner794d5672011-10-10 03:21:36 +02009309 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009310 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009311 case PyUnicode_1BYTE_KIND:
9312 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9313 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9314 else
9315 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9316 break;
9317 case PyUnicode_2BYTE_KIND:
9318 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9319 break;
9320 case PyUnicode_4BYTE_KIND:
9321 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9322 break;
9323 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009324 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009325 }
9326 }
9327 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009328 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009329 case PyUnicode_1BYTE_KIND:
9330 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9331 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9332 else
9333 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334 break;
9335 case PyUnicode_2BYTE_KIND:
9336 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9337 break;
9338 case PyUnicode_4BYTE_KIND:
9339 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9340 break;
9341 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009342 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009343 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009344 }
9345
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009346 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009347 PyMem_Free(buf2);
9348
9349 return result;
9350}
9351
Victor Stinner59423e32018-11-26 13:40:01 +01009352/* _PyUnicode_InsertThousandsGrouping() helper functions */
9353#include "stringlib/localeutil.h"
9354
9355/**
9356 * InsertThousandsGrouping:
9357 * @writer: Unicode writer.
9358 * @n_buffer: Number of characters in @buffer.
9359 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9360 * @d_pos: Start of digits string.
9361 * @n_digits: The number of digits in the string, in which we want
9362 * to put the grouping chars.
9363 * @min_width: The minimum width of the digits in the output string.
9364 * Output will be zero-padded on the left to fill.
9365 * @grouping: see definition in localeconv().
9366 * @thousands_sep: see definition in localeconv().
9367 *
9368 * There are 2 modes: counting and filling. If @writer is NULL,
9369 * we are in counting mode, else filling mode.
9370 * If counting, the required buffer size is returned.
9371 * If filling, we know the buffer will be large enough, so we don't
9372 * need to pass in the buffer size.
9373 * Inserts thousand grouping characters (as defined by grouping and
9374 * thousands_sep) into @writer.
9375 *
9376 * Return value: -1 on error, number of characters otherwise.
9377 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009378Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009379_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009380 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009381 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009382 PyObject *digits,
9383 Py_ssize_t d_pos,
9384 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009385 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009386 const char *grouping,
9387 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009388 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389{
Xtreak3f7983a2019-01-07 20:39:14 +05309390 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009391 if (writer) {
9392 assert(digits != NULL);
9393 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009394 }
9395 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009396 assert(digits == NULL);
9397 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009398 }
Victor Stinner59423e32018-11-26 13:40:01 +01009399 assert(0 <= d_pos);
9400 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009401 assert(grouping != NULL);
9402
9403 if (digits != NULL) {
9404 if (PyUnicode_READY(digits) == -1) {
9405 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009406 }
Victor Stinner59423e32018-11-26 13:40:01 +01009407 }
9408 if (PyUnicode_READY(thousands_sep) == -1) {
9409 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009410 }
9411
Victor Stinner59423e32018-11-26 13:40:01 +01009412 Py_ssize_t count = 0;
9413 Py_ssize_t n_zeros;
9414 int loop_broken = 0;
9415 int use_separator = 0; /* First time through, don't append the
9416 separator. They only go between
9417 groups. */
9418 Py_ssize_t buffer_pos;
9419 Py_ssize_t digits_pos;
9420 Py_ssize_t len;
9421 Py_ssize_t n_chars;
9422 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9423 be looked at */
9424 /* A generator that returns all of the grouping widths, until it
9425 returns 0. */
9426 GroupGenerator groupgen;
9427 GroupGenerator_init(&groupgen, grouping);
9428 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9429
9430 /* if digits are not grouped, thousands separator
9431 should be an empty string */
9432 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9433
9434 digits_pos = d_pos + n_digits;
9435 if (writer) {
9436 buffer_pos = writer->pos + n_buffer;
9437 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9438 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009439 }
Victor Stinner59423e32018-11-26 13:40:01 +01009440 else {
9441 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009442 }
Victor Stinner59423e32018-11-26 13:40:01 +01009443
9444 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009445 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009446 }
Victor Stinner59423e32018-11-26 13:40:01 +01009447
9448 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9449 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9450 n_zeros = Py_MAX(0, len - remaining);
9451 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9452
9453 /* Use n_zero zero's and n_chars chars */
9454
9455 /* Count only, don't do anything. */
9456 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9457
9458 /* Copy into the writer. */
9459 InsertThousandsGrouping_fill(writer, &buffer_pos,
9460 digits, &digits_pos,
9461 n_chars, n_zeros,
9462 use_separator ? thousands_sep : NULL,
9463 thousands_sep_len, maxchar);
9464
9465 /* Use a separator next time. */
9466 use_separator = 1;
9467
9468 remaining -= n_chars;
9469 min_width -= len;
9470
9471 if (remaining <= 0 && min_width <= 0) {
9472 loop_broken = 1;
9473 break;
9474 }
9475 min_width -= thousands_sep_len;
9476 }
9477 if (!loop_broken) {
9478 /* We left the loop without using a break statement. */
9479
9480 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9481 n_zeros = Py_MAX(0, len - remaining);
9482 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9483
9484 /* Use n_zero zero's and n_chars chars */
9485 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9486
9487 /* Copy into the writer. */
9488 InsertThousandsGrouping_fill(writer, &buffer_pos,
9489 digits, &digits_pos,
9490 n_chars, n_zeros,
9491 use_separator ? thousands_sep : NULL,
9492 thousands_sep_len, maxchar);
9493 }
9494 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495}
9496
9497
Alexander Belopolsky40018472011-02-26 01:02:56 +00009498Py_ssize_t
9499PyUnicode_Count(PyObject *str,
9500 PyObject *substr,
9501 Py_ssize_t start,
9502 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009503{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009504 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009505 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506 void *buf1 = NULL, *buf2 = NULL;
9507 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009508
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009509 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009510 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009511
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009512 kind1 = PyUnicode_KIND(str);
9513 kind2 = PyUnicode_KIND(substr);
9514 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009515 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009516
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009517 len1 = PyUnicode_GET_LENGTH(str);
9518 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009519 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009520 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009521 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009522
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009523 buf1 = PyUnicode_DATA(str);
9524 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009525 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009526 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009527 if (!buf2)
9528 goto onError;
9529 }
9530
9531 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009532 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009533 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009534 result = asciilib_count(
9535 ((Py_UCS1*)buf1) + start, end - start,
9536 buf2, len2, PY_SSIZE_T_MAX
9537 );
9538 else
9539 result = ucs1lib_count(
9540 ((Py_UCS1*)buf1) + start, end - start,
9541 buf2, len2, PY_SSIZE_T_MAX
9542 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009543 break;
9544 case PyUnicode_2BYTE_KIND:
9545 result = ucs2lib_count(
9546 ((Py_UCS2*)buf1) + start, end - start,
9547 buf2, len2, PY_SSIZE_T_MAX
9548 );
9549 break;
9550 case PyUnicode_4BYTE_KIND:
9551 result = ucs4lib_count(
9552 ((Py_UCS4*)buf1) + start, end - start,
9553 buf2, len2, PY_SSIZE_T_MAX
9554 );
9555 break;
9556 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009557 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009558 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009559
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009560 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009561 PyMem_Free(buf2);
9562
Guido van Rossumd57fd912000-03-10 22:53:23 +00009563 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009564 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009565 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 PyMem_Free(buf2);
9567 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009568}
9569
Alexander Belopolsky40018472011-02-26 01:02:56 +00009570Py_ssize_t
9571PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009572 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009573 Py_ssize_t start,
9574 Py_ssize_t end,
9575 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009576{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009577 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009578 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009579
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009580 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009581}
9582
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009583Py_ssize_t
9584PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9585 Py_ssize_t start, Py_ssize_t end,
9586 int direction)
9587{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009588 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009589 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 if (PyUnicode_READY(str) == -1)
9591 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009592 len = PyUnicode_GET_LENGTH(str);
9593 ADJUST_INDICES(start, end, len);
9594 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009595 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009596 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009597 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9598 kind, end-start, ch, direction);
9599 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009600 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009601 else
9602 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009603}
9604
Alexander Belopolsky40018472011-02-26 01:02:56 +00009605static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009606tailmatch(PyObject *self,
9607 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009608 Py_ssize_t start,
9609 Py_ssize_t end,
9610 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009611{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612 int kind_self;
9613 int kind_sub;
9614 void *data_self;
9615 void *data_sub;
9616 Py_ssize_t offset;
9617 Py_ssize_t i;
9618 Py_ssize_t end_sub;
9619
9620 if (PyUnicode_READY(self) == -1 ||
9621 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009622 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009624 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9625 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009626 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009627 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009629 if (PyUnicode_GET_LENGTH(substring) == 0)
9630 return 1;
9631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009632 kind_self = PyUnicode_KIND(self);
9633 data_self = PyUnicode_DATA(self);
9634 kind_sub = PyUnicode_KIND(substring);
9635 data_sub = PyUnicode_DATA(substring);
9636 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9637
9638 if (direction > 0)
9639 offset = end;
9640 else
9641 offset = start;
9642
9643 if (PyUnicode_READ(kind_self, data_self, offset) ==
9644 PyUnicode_READ(kind_sub, data_sub, 0) &&
9645 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9646 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9647 /* If both are of the same kind, memcmp is sufficient */
9648 if (kind_self == kind_sub) {
9649 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009650 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009651 data_sub,
9652 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009653 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009654 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009655 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 else {
9657 /* We do not need to compare 0 and len(substring)-1 because
9658 the if statement above ensured already that they are equal
9659 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 for (i = 1; i < end_sub; ++i) {
9661 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9662 PyUnicode_READ(kind_sub, data_sub, i))
9663 return 0;
9664 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009665 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009666 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009667 }
9668
9669 return 0;
9670}
9671
Alexander Belopolsky40018472011-02-26 01:02:56 +00009672Py_ssize_t
9673PyUnicode_Tailmatch(PyObject *str,
9674 PyObject *substr,
9675 Py_ssize_t start,
9676 Py_ssize_t end,
9677 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009678{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009679 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009680 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009681
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009682 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009683}
9684
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009685static PyObject *
9686ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009687{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009688 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9689 char *resdata, *data = PyUnicode_DATA(self);
9690 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009691
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009692 res = PyUnicode_New(len, 127);
9693 if (res == NULL)
9694 return NULL;
9695 resdata = PyUnicode_DATA(res);
9696 if (lower)
9697 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009698 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009699 _Py_bytes_upper(resdata, data, len);
9700 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009701}
9702
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009703static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009704handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009705{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706 Py_ssize_t j;
9707 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009708 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009709 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009710
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9712
9713 where ! is a negation and \p{xxx} is a character with property xxx.
9714 */
9715 for (j = i - 1; j >= 0; j--) {
9716 c = PyUnicode_READ(kind, data, j);
9717 if (!_PyUnicode_IsCaseIgnorable(c))
9718 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009719 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009720 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9721 if (final_sigma) {
9722 for (j = i + 1; j < length; j++) {
9723 c = PyUnicode_READ(kind, data, j);
9724 if (!_PyUnicode_IsCaseIgnorable(c))
9725 break;
9726 }
9727 final_sigma = j == length || !_PyUnicode_IsCased(c);
9728 }
9729 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009730}
9731
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009732static int
9733lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9734 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009735{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009736 /* Obscure special case. */
9737 if (c == 0x3A3) {
9738 mapped[0] = handle_capital_sigma(kind, data, length, i);
9739 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009740 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009741 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742}
9743
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009744static Py_ssize_t
9745do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009746{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009747 Py_ssize_t i, k = 0;
9748 int n_res, j;
9749 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009750
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009751 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009752 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009753 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009754 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009756 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 for (i = 1; i < length; i++) {
9758 c = PyUnicode_READ(kind, data, i);
9759 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9760 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009761 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009762 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009763 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009764 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009765 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009766}
9767
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009768static Py_ssize_t
9769do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9770 Py_ssize_t i, k = 0;
9771
9772 for (i = 0; i < length; i++) {
9773 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9774 int n_res, j;
9775 if (Py_UNICODE_ISUPPER(c)) {
9776 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9777 }
9778 else if (Py_UNICODE_ISLOWER(c)) {
9779 n_res = _PyUnicode_ToUpperFull(c, mapped);
9780 }
9781 else {
9782 n_res = 1;
9783 mapped[0] = c;
9784 }
9785 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009786 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009787 res[k++] = mapped[j];
9788 }
9789 }
9790 return k;
9791}
9792
9793static Py_ssize_t
9794do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9795 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009796{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009797 Py_ssize_t i, k = 0;
9798
9799 for (i = 0; i < length; i++) {
9800 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9801 int n_res, j;
9802 if (lower)
9803 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9804 else
9805 n_res = _PyUnicode_ToUpperFull(c, mapped);
9806 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009807 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009808 res[k++] = mapped[j];
9809 }
9810 }
9811 return k;
9812}
9813
9814static Py_ssize_t
9815do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9816{
9817 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9818}
9819
9820static Py_ssize_t
9821do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9822{
9823 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9824}
9825
Benjamin Petersone51757f2012-01-12 21:10:29 -05009826static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009827do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9828{
9829 Py_ssize_t i, k = 0;
9830
9831 for (i = 0; i < length; i++) {
9832 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9833 Py_UCS4 mapped[3];
9834 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9835 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009836 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009837 res[k++] = mapped[j];
9838 }
9839 }
9840 return k;
9841}
9842
9843static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009844do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9845{
9846 Py_ssize_t i, k = 0;
9847 int previous_is_cased;
9848
9849 previous_is_cased = 0;
9850 for (i = 0; i < length; i++) {
9851 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9852 Py_UCS4 mapped[3];
9853 int n_res, j;
9854
9855 if (previous_is_cased)
9856 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9857 else
9858 n_res = _PyUnicode_ToTitleFull(c, mapped);
9859
9860 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009861 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009862 res[k++] = mapped[j];
9863 }
9864
9865 previous_is_cased = _PyUnicode_IsCased(c);
9866 }
9867 return k;
9868}
9869
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870static PyObject *
9871case_operation(PyObject *self,
9872 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9873{
9874 PyObject *res = NULL;
9875 Py_ssize_t length, newlength = 0;
9876 int kind, outkind;
9877 void *data, *outdata;
9878 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9879
Benjamin Petersoneea48462012-01-16 14:28:50 -05009880 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009881
9882 kind = PyUnicode_KIND(self);
9883 data = PyUnicode_DATA(self);
9884 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009885 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009886 PyErr_SetString(PyExc_OverflowError, "string is too long");
9887 return NULL;
9888 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009889 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009890 if (tmp == NULL)
9891 return PyErr_NoMemory();
9892 newlength = perform(kind, data, length, tmp, &maxchar);
9893 res = PyUnicode_New(newlength, maxchar);
9894 if (res == NULL)
9895 goto leave;
9896 tmpend = tmp + newlength;
9897 outdata = PyUnicode_DATA(res);
9898 outkind = PyUnicode_KIND(res);
9899 switch (outkind) {
9900 case PyUnicode_1BYTE_KIND:
9901 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9902 break;
9903 case PyUnicode_2BYTE_KIND:
9904 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9905 break;
9906 case PyUnicode_4BYTE_KIND:
9907 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9908 break;
9909 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009910 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009911 }
9912 leave:
9913 PyMem_FREE(tmp);
9914 return res;
9915}
9916
Tim Peters8ce9f162004-08-27 01:49:32 +00009917PyObject *
9918PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009919{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009920 PyObject *res;
9921 PyObject *fseq;
9922 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009923 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009924
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009925 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009926 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009927 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009928 }
9929
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009930 /* NOTE: the following code can't call back into Python code,
9931 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009932 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009933
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009934 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009935 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009936 res = _PyUnicode_JoinArray(separator, items, seqlen);
9937 Py_DECREF(fseq);
9938 return res;
9939}
9940
9941PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009942_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009943{
9944 PyObject *res = NULL; /* the result */
9945 PyObject *sep = NULL;
9946 Py_ssize_t seplen;
9947 PyObject *item;
9948 Py_ssize_t sz, i, res_offset;
9949 Py_UCS4 maxchar;
9950 Py_UCS4 item_maxchar;
9951 int use_memcpy;
9952 unsigned char *res_data = NULL, *sep_data = NULL;
9953 PyObject *last_obj;
9954 unsigned int kind = 0;
9955
Tim Peters05eba1f2004-08-27 21:32:02 +00009956 /* If empty sequence, return u"". */
9957 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009958 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009959 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009960
Tim Peters05eba1f2004-08-27 21:32:02 +00009961 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009962 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009963 if (seqlen == 1) {
9964 if (PyUnicode_CheckExact(items[0])) {
9965 res = items[0];
9966 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009967 return res;
9968 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009969 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009970 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009971 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009972 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009973 /* Set up sep and seplen */
9974 if (separator == NULL) {
9975 /* fall back to a blank space separator */
9976 sep = PyUnicode_FromOrdinal(' ');
9977 if (!sep)
9978 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009979 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009980 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009981 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009982 else {
9983 if (!PyUnicode_Check(separator)) {
9984 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009985 "separator: expected str instance,"
9986 " %.80s found",
9987 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009988 goto onError;
9989 }
9990 if (PyUnicode_READY(separator))
9991 goto onError;
9992 sep = separator;
9993 seplen = PyUnicode_GET_LENGTH(separator);
9994 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9995 /* inc refcount to keep this code path symmetric with the
9996 above case of a blank separator */
9997 Py_INCREF(sep);
9998 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009999 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010000 }
10001
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010002 /* There are at least two things to join, or else we have a subclass
10003 * of str in the sequence.
10004 * Do a pre-pass to figure out the total amount of space we'll
10005 * need (sz), and see whether all argument are strings.
10006 */
10007 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010008#ifdef Py_DEBUG
10009 use_memcpy = 0;
10010#else
10011 use_memcpy = 1;
10012#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010013 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010014 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010015 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010016 if (!PyUnicode_Check(item)) {
10017 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010018 "sequence item %zd: expected str instance,"
10019 " %.80s found",
10020 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010021 goto onError;
10022 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010023 if (PyUnicode_READY(item) == -1)
10024 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010025 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010026 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010027 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010028 if (i != 0) {
10029 add_sz += seplen;
10030 }
10031 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010032 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010033 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010034 goto onError;
10035 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010036 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010037 if (use_memcpy && last_obj != NULL) {
10038 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10039 use_memcpy = 0;
10040 }
10041 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010042 }
Tim Petersced69f82003-09-16 20:30:58 +000010043
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010044 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010045 if (res == NULL)
10046 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010047
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010048 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010049#ifdef Py_DEBUG
10050 use_memcpy = 0;
10051#else
10052 if (use_memcpy) {
10053 res_data = PyUnicode_1BYTE_DATA(res);
10054 kind = PyUnicode_KIND(res);
10055 if (seplen != 0)
10056 sep_data = PyUnicode_1BYTE_DATA(sep);
10057 }
10058#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010059 if (use_memcpy) {
10060 for (i = 0; i < seqlen; ++i) {
10061 Py_ssize_t itemlen;
10062 item = items[i];
10063
10064 /* Copy item, and maybe the separator. */
10065 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010066 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010067 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010068 kind * seplen);
10069 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010070 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010071
10072 itemlen = PyUnicode_GET_LENGTH(item);
10073 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010074 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010075 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010076 kind * itemlen);
10077 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010078 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010079 }
10080 assert(res_data == PyUnicode_1BYTE_DATA(res)
10081 + kind * PyUnicode_GET_LENGTH(res));
10082 }
10083 else {
10084 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10085 Py_ssize_t itemlen;
10086 item = items[i];
10087
10088 /* Copy item, and maybe the separator. */
10089 if (i && seplen != 0) {
10090 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10091 res_offset += seplen;
10092 }
10093
10094 itemlen = PyUnicode_GET_LENGTH(item);
10095 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010096 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010097 res_offset += itemlen;
10098 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010099 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010100 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010101 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010102
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010103 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010104 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010106
Benjamin Peterson29060642009-01-31 22:14:21 +000010107 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010108 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010109 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010110 return NULL;
10111}
10112
Victor Stinnerd3f08822012-05-29 12:57:52 +020010113void
10114_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10115 Py_UCS4 fill_char)
10116{
10117 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010118 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010119 assert(PyUnicode_IS_READY(unicode));
10120 assert(unicode_modifiable(unicode));
10121 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10122 assert(start >= 0);
10123 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010124 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010125}
10126
Victor Stinner3fe55312012-01-04 00:33:50 +010010127Py_ssize_t
10128PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10129 Py_UCS4 fill_char)
10130{
10131 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010132
10133 if (!PyUnicode_Check(unicode)) {
10134 PyErr_BadInternalCall();
10135 return -1;
10136 }
10137 if (PyUnicode_READY(unicode) == -1)
10138 return -1;
10139 if (unicode_check_modifiable(unicode))
10140 return -1;
10141
Victor Stinnerd3f08822012-05-29 12:57:52 +020010142 if (start < 0) {
10143 PyErr_SetString(PyExc_IndexError, "string index out of range");
10144 return -1;
10145 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010146 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10147 PyErr_SetString(PyExc_ValueError,
10148 "fill character is bigger than "
10149 "the string maximum character");
10150 return -1;
10151 }
10152
10153 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10154 length = Py_MIN(maxlen, length);
10155 if (length <= 0)
10156 return 0;
10157
Victor Stinnerd3f08822012-05-29 12:57:52 +020010158 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010159 return length;
10160}
10161
Victor Stinner9310abb2011-10-05 00:59:23 +020010162static PyObject *
10163pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010164 Py_ssize_t left,
10165 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010166 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010167{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 PyObject *u;
10169 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010170 int kind;
10171 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010172
10173 if (left < 0)
10174 left = 0;
10175 if (right < 0)
10176 right = 0;
10177
Victor Stinnerc4b49542011-12-11 22:44:26 +010010178 if (left == 0 && right == 0)
10179 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010180
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010181 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10182 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010183 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10184 return NULL;
10185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010186 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010187 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010189 if (!u)
10190 return NULL;
10191
10192 kind = PyUnicode_KIND(u);
10193 data = PyUnicode_DATA(u);
10194 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010195 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010196 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010197 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010198 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010199 assert(_PyUnicode_CheckConsistency(u, 1));
10200 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010201}
10202
Alexander Belopolsky40018472011-02-26 01:02:56 +000010203PyObject *
10204PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010205{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010206 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010208 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010209 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010210
Benjamin Petersonead6b532011-12-20 17:23:42 -060010211 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010212 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010213 if (PyUnicode_IS_ASCII(string))
10214 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010215 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010216 PyUnicode_GET_LENGTH(string), keepends);
10217 else
10218 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010219 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010220 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010221 break;
10222 case PyUnicode_2BYTE_KIND:
10223 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010224 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 PyUnicode_GET_LENGTH(string), keepends);
10226 break;
10227 case PyUnicode_4BYTE_KIND:
10228 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010229 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010230 PyUnicode_GET_LENGTH(string), keepends);
10231 break;
10232 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010233 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010234 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010235 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010236}
10237
Alexander Belopolsky40018472011-02-26 01:02:56 +000010238static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010239split(PyObject *self,
10240 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010241 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010242{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010243 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010244 void *buf1, *buf2;
10245 Py_ssize_t len1, len2;
10246 PyObject* out;
10247
Guido van Rossumd57fd912000-03-10 22:53:23 +000010248 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010249 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010251 if (PyUnicode_READY(self) == -1)
10252 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010253
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010254 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010255 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010257 if (PyUnicode_IS_ASCII(self))
10258 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010259 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010260 PyUnicode_GET_LENGTH(self), maxcount
10261 );
10262 else
10263 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010264 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010265 PyUnicode_GET_LENGTH(self), maxcount
10266 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010267 case PyUnicode_2BYTE_KIND:
10268 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010269 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010270 PyUnicode_GET_LENGTH(self), maxcount
10271 );
10272 case PyUnicode_4BYTE_KIND:
10273 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010274 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010275 PyUnicode_GET_LENGTH(self), maxcount
10276 );
10277 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010278 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010279 }
10280
10281 if (PyUnicode_READY(substring) == -1)
10282 return NULL;
10283
10284 kind1 = PyUnicode_KIND(self);
10285 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010286 len1 = PyUnicode_GET_LENGTH(self);
10287 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010288 if (kind1 < kind2 || len1 < len2) {
10289 out = PyList_New(1);
10290 if (out == NULL)
10291 return NULL;
10292 Py_INCREF(self);
10293 PyList_SET_ITEM(out, 0, self);
10294 return out;
10295 }
10296 buf1 = PyUnicode_DATA(self);
10297 buf2 = PyUnicode_DATA(substring);
10298 if (kind2 != kind1) {
10299 buf2 = _PyUnicode_AsKind(substring, kind1);
10300 if (!buf2)
10301 return NULL;
10302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010304 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010306 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10307 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010308 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010309 else
10310 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010311 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010312 break;
10313 case PyUnicode_2BYTE_KIND:
10314 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010315 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010316 break;
10317 case PyUnicode_4BYTE_KIND:
10318 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010319 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010320 break;
10321 default:
10322 out = NULL;
10323 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010324 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010325 PyMem_Free(buf2);
10326 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327}
10328
Alexander Belopolsky40018472011-02-26 01:02:56 +000010329static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010330rsplit(PyObject *self,
10331 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010332 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010333{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010334 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010335 void *buf1, *buf2;
10336 Py_ssize_t len1, len2;
10337 PyObject* out;
10338
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010339 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010340 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010341
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 if (PyUnicode_READY(self) == -1)
10343 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010345 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010346 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010348 if (PyUnicode_IS_ASCII(self))
10349 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010350 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010351 PyUnicode_GET_LENGTH(self), maxcount
10352 );
10353 else
10354 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010355 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010356 PyUnicode_GET_LENGTH(self), maxcount
10357 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010358 case PyUnicode_2BYTE_KIND:
10359 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010360 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 PyUnicode_GET_LENGTH(self), maxcount
10362 );
10363 case PyUnicode_4BYTE_KIND:
10364 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010365 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010366 PyUnicode_GET_LENGTH(self), maxcount
10367 );
10368 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010369 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010370 }
10371
10372 if (PyUnicode_READY(substring) == -1)
10373 return NULL;
10374
10375 kind1 = PyUnicode_KIND(self);
10376 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010377 len1 = PyUnicode_GET_LENGTH(self);
10378 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010379 if (kind1 < kind2 || len1 < len2) {
10380 out = PyList_New(1);
10381 if (out == NULL)
10382 return NULL;
10383 Py_INCREF(self);
10384 PyList_SET_ITEM(out, 0, self);
10385 return out;
10386 }
10387 buf1 = PyUnicode_DATA(self);
10388 buf2 = PyUnicode_DATA(substring);
10389 if (kind2 != kind1) {
10390 buf2 = _PyUnicode_AsKind(substring, kind1);
10391 if (!buf2)
10392 return NULL;
10393 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010394
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010395 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010397 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10398 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010399 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010400 else
10401 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010402 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 break;
10404 case PyUnicode_2BYTE_KIND:
10405 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010406 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010407 break;
10408 case PyUnicode_4BYTE_KIND:
10409 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010410 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010411 break;
10412 default:
10413 out = NULL;
10414 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010415 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010416 PyMem_Free(buf2);
10417 return out;
10418}
10419
10420static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010421anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10422 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010423{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010424 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10427 return asciilib_find(buf1, len1, buf2, len2, offset);
10428 else
10429 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010430 case PyUnicode_2BYTE_KIND:
10431 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10432 case PyUnicode_4BYTE_KIND:
10433 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10434 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010435 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010436}
10437
10438static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010439anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10440 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010441{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010442 switch (kind) {
10443 case PyUnicode_1BYTE_KIND:
10444 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10445 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10446 else
10447 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10448 case PyUnicode_2BYTE_KIND:
10449 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10450 case PyUnicode_4BYTE_KIND:
10451 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10452 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010453 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010454}
10455
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010456static void
10457replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10458 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10459{
10460 int kind = PyUnicode_KIND(u);
10461 void *data = PyUnicode_DATA(u);
10462 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10463 if (kind == PyUnicode_1BYTE_KIND) {
10464 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10465 (Py_UCS1 *)data + len,
10466 u1, u2, maxcount);
10467 }
10468 else if (kind == PyUnicode_2BYTE_KIND) {
10469 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10470 (Py_UCS2 *)data + len,
10471 u1, u2, maxcount);
10472 }
10473 else {
10474 assert(kind == PyUnicode_4BYTE_KIND);
10475 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10476 (Py_UCS4 *)data + len,
10477 u1, u2, maxcount);
10478 }
10479}
10480
Alexander Belopolsky40018472011-02-26 01:02:56 +000010481static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010482replace(PyObject *self, PyObject *str1,
10483 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010484{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010485 PyObject *u;
10486 char *sbuf = PyUnicode_DATA(self);
10487 char *buf1 = PyUnicode_DATA(str1);
10488 char *buf2 = PyUnicode_DATA(str2);
10489 int srelease = 0, release1 = 0, release2 = 0;
10490 int skind = PyUnicode_KIND(self);
10491 int kind1 = PyUnicode_KIND(str1);
10492 int kind2 = PyUnicode_KIND(str2);
10493 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10494 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10495 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010496 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010497 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010498
10499 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010500 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010501 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010502 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010503
Victor Stinner59de0ee2011-10-07 10:01:28 +020010504 if (str1 == str2)
10505 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010506
Victor Stinner49a0a212011-10-12 23:46:10 +020010507 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010508 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10509 if (maxchar < maxchar_str1)
10510 /* substring too wide to be present */
10511 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010512 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10513 /* Replacing str1 with str2 may cause a maxchar reduction in the
10514 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010515 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010516 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010517
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010518 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010519 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010521 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010523 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010524 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010525 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010526
Victor Stinner69ed0f42013-04-09 21:48:24 +020010527 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010528 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010529 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010530 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010531 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010532 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010533 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010535
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010536 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10537 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010538 }
10539 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540 int rkind = skind;
10541 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010542 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010543
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010544 if (kind1 < rkind) {
10545 /* widen substring */
10546 buf1 = _PyUnicode_AsKind(str1, rkind);
10547 if (!buf1) goto error;
10548 release1 = 1;
10549 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010550 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010551 if (i < 0)
10552 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553 if (rkind > kind2) {
10554 /* widen replacement */
10555 buf2 = _PyUnicode_AsKind(str2, rkind);
10556 if (!buf2) goto error;
10557 release2 = 1;
10558 }
10559 else if (rkind < kind2) {
10560 /* widen self and buf1 */
10561 rkind = kind2;
10562 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010563 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010564 sbuf = _PyUnicode_AsKind(self, rkind);
10565 if (!sbuf) goto error;
10566 srelease = 1;
10567 buf1 = _PyUnicode_AsKind(str1, rkind);
10568 if (!buf1) goto error;
10569 release1 = 1;
10570 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010571 u = PyUnicode_New(slen, maxchar);
10572 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010573 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010574 assert(PyUnicode_KIND(u) == rkind);
10575 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010576
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010577 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010578 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010580 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010583
10584 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010585 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010586 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010588 if (i == -1)
10589 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010590 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010591 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010592 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010594 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010595 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010596 }
10597 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010598 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010599 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 int rkind = skind;
10601 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010602
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010603 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010604 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 buf1 = _PyUnicode_AsKind(str1, rkind);
10606 if (!buf1) goto error;
10607 release1 = 1;
10608 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010609 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010610 if (n == 0)
10611 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010612 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 buf2 = _PyUnicode_AsKind(str2, rkind);
10615 if (!buf2) goto error;
10616 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010617 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010618 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010619 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 rkind = kind2;
10621 sbuf = _PyUnicode_AsKind(self, rkind);
10622 if (!sbuf) goto error;
10623 srelease = 1;
10624 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010625 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626 buf1 = _PyUnicode_AsKind(str1, rkind);
10627 if (!buf1) goto error;
10628 release1 = 1;
10629 }
10630 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10631 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010632 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010633 PyErr_SetString(PyExc_OverflowError,
10634 "replace string is too long");
10635 goto error;
10636 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010637 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010638 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010639 _Py_INCREF_UNICODE_EMPTY();
10640 if (!unicode_empty)
10641 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010642 u = unicode_empty;
10643 goto done;
10644 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010645 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010646 PyErr_SetString(PyExc_OverflowError,
10647 "replace string is too long");
10648 goto error;
10649 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010650 u = PyUnicode_New(new_size, maxchar);
10651 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010653 assert(PyUnicode_KIND(u) == rkind);
10654 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010655 ires = i = 0;
10656 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010657 while (n-- > 0) {
10658 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010659 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010660 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010662 if (j == -1)
10663 break;
10664 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010665 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010666 memcpy(res + rkind * ires,
10667 sbuf + rkind * i,
10668 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010669 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010670 }
10671 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010672 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010673 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010677 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010682 memcpy(res + rkind * ires,
10683 sbuf + rkind * i,
10684 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010685 }
10686 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010687 /* interleave */
10688 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010689 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010690 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010691 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010693 if (--n <= 0)
10694 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010695 memcpy(res + rkind * ires,
10696 sbuf + rkind * i,
10697 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010698 ires++;
10699 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010700 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010701 memcpy(res + rkind * ires,
10702 sbuf + rkind * i,
10703 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010704 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010705 }
10706
10707 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010708 unicode_adjust_maxchar(&u);
10709 if (u == NULL)
10710 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010711 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010712
10713 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 if (srelease)
10715 PyMem_FREE(sbuf);
10716 if (release1)
10717 PyMem_FREE(buf1);
10718 if (release2)
10719 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010720 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010722
Benjamin Peterson29060642009-01-31 22:14:21 +000010723 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010725 if (srelease)
10726 PyMem_FREE(sbuf);
10727 if (release1)
10728 PyMem_FREE(buf1);
10729 if (release2)
10730 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010731 return unicode_result_unchanged(self);
10732
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010733 error:
10734 if (srelease && sbuf)
10735 PyMem_FREE(sbuf);
10736 if (release1 && buf1)
10737 PyMem_FREE(buf1);
10738 if (release2 && buf2)
10739 PyMem_FREE(buf2);
10740 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010741}
10742
10743/* --- Unicode Object Methods --------------------------------------------- */
10744
INADA Naoki3ae20562017-01-16 20:41:20 +090010745/*[clinic input]
10746str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010747
INADA Naoki3ae20562017-01-16 20:41:20 +090010748Return a version of the string where each word is titlecased.
10749
10750More specifically, words start with uppercased characters and all remaining
10751cased characters have lower case.
10752[clinic start generated code]*/
10753
10754static PyObject *
10755unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010756/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010757{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010758 if (PyUnicode_READY(self) == -1)
10759 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010760 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010761}
10762
INADA Naoki3ae20562017-01-16 20:41:20 +090010763/*[clinic input]
10764str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010765
INADA Naoki3ae20562017-01-16 20:41:20 +090010766Return a capitalized version of the string.
10767
10768More specifically, make the first character have upper case and the rest lower
10769case.
10770[clinic start generated code]*/
10771
10772static PyObject *
10773unicode_capitalize_impl(PyObject *self)
10774/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010775{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010776 if (PyUnicode_READY(self) == -1)
10777 return NULL;
10778 if (PyUnicode_GET_LENGTH(self) == 0)
10779 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010780 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010781}
10782
INADA Naoki3ae20562017-01-16 20:41:20 +090010783/*[clinic input]
10784str.casefold as unicode_casefold
10785
10786Return a version of the string suitable for caseless comparisons.
10787[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010788
10789static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010790unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010791/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010792{
10793 if (PyUnicode_READY(self) == -1)
10794 return NULL;
10795 if (PyUnicode_IS_ASCII(self))
10796 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010797 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010798}
10799
10800
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010801/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010802
10803static int
10804convert_uc(PyObject *obj, void *addr)
10805{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010807
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010808 if (!PyUnicode_Check(obj)) {
10809 PyErr_Format(PyExc_TypeError,
10810 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010811 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010812 return 0;
10813 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010814 if (PyUnicode_READY(obj) < 0)
10815 return 0;
10816 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010817 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010818 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010819 return 0;
10820 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010821 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010822 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010823}
10824
INADA Naoki3ae20562017-01-16 20:41:20 +090010825/*[clinic input]
10826str.center as unicode_center
10827
10828 width: Py_ssize_t
10829 fillchar: Py_UCS4 = ' '
10830 /
10831
10832Return a centered string of length width.
10833
10834Padding is done using the specified fill character (default is a space).
10835[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010836
10837static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010838unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10839/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010840{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010841 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842
Benjamin Petersonbac79492012-01-14 13:34:47 -050010843 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844 return NULL;
10845
Victor Stinnerc4b49542011-12-11 22:44:26 +010010846 if (PyUnicode_GET_LENGTH(self) >= width)
10847 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010848
Victor Stinnerc4b49542011-12-11 22:44:26 +010010849 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850 left = marg / 2 + (marg & width & 1);
10851
Victor Stinner9310abb2011-10-05 00:59:23 +020010852 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010853}
10854
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010855/* This function assumes that str1 and str2 are readied by the caller. */
10856
Marc-André Lemburge5034372000-08-08 08:04:29 +000010857static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010858unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010859{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010860#define COMPARE(TYPE1, TYPE2) \
10861 do { \
10862 TYPE1* p1 = (TYPE1 *)data1; \
10863 TYPE2* p2 = (TYPE2 *)data2; \
10864 TYPE1* end = p1 + len; \
10865 Py_UCS4 c1, c2; \
10866 for (; p1 != end; p1++, p2++) { \
10867 c1 = *p1; \
10868 c2 = *p2; \
10869 if (c1 != c2) \
10870 return (c1 < c2) ? -1 : 1; \
10871 } \
10872 } \
10873 while (0)
10874
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010875 int kind1, kind2;
10876 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010877 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010878
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010879 kind1 = PyUnicode_KIND(str1);
10880 kind2 = PyUnicode_KIND(str2);
10881 data1 = PyUnicode_DATA(str1);
10882 data2 = PyUnicode_DATA(str2);
10883 len1 = PyUnicode_GET_LENGTH(str1);
10884 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010885 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010886
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010887 switch(kind1) {
10888 case PyUnicode_1BYTE_KIND:
10889 {
10890 switch(kind2) {
10891 case PyUnicode_1BYTE_KIND:
10892 {
10893 int cmp = memcmp(data1, data2, len);
10894 /* normalize result of memcmp() into the range [-1; 1] */
10895 if (cmp < 0)
10896 return -1;
10897 if (cmp > 0)
10898 return 1;
10899 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010900 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010901 case PyUnicode_2BYTE_KIND:
10902 COMPARE(Py_UCS1, Py_UCS2);
10903 break;
10904 case PyUnicode_4BYTE_KIND:
10905 COMPARE(Py_UCS1, Py_UCS4);
10906 break;
10907 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010908 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010909 }
10910 break;
10911 }
10912 case PyUnicode_2BYTE_KIND:
10913 {
10914 switch(kind2) {
10915 case PyUnicode_1BYTE_KIND:
10916 COMPARE(Py_UCS2, Py_UCS1);
10917 break;
10918 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010919 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010920 COMPARE(Py_UCS2, Py_UCS2);
10921 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010922 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010923 case PyUnicode_4BYTE_KIND:
10924 COMPARE(Py_UCS2, Py_UCS4);
10925 break;
10926 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010927 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010928 }
10929 break;
10930 }
10931 case PyUnicode_4BYTE_KIND:
10932 {
10933 switch(kind2) {
10934 case PyUnicode_1BYTE_KIND:
10935 COMPARE(Py_UCS4, Py_UCS1);
10936 break;
10937 case PyUnicode_2BYTE_KIND:
10938 COMPARE(Py_UCS4, Py_UCS2);
10939 break;
10940 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010941 {
10942#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10943 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10944 /* normalize result of wmemcmp() into the range [-1; 1] */
10945 if (cmp < 0)
10946 return -1;
10947 if (cmp > 0)
10948 return 1;
10949#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010950 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010951#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010953 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010955 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010956 }
10957 break;
10958 }
10959 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010960 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010961 }
10962
Victor Stinner770e19e2012-10-04 22:59:45 +020010963 if (len1 == len2)
10964 return 0;
10965 if (len1 < len2)
10966 return -1;
10967 else
10968 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010969
10970#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010971}
10972
Benjamin Peterson621b4302016-09-09 13:54:34 -070010973static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010974unicode_compare_eq(PyObject *str1, PyObject *str2)
10975{
10976 int kind;
10977 void *data1, *data2;
10978 Py_ssize_t len;
10979 int cmp;
10980
Victor Stinnere5567ad2012-10-23 02:48:49 +020010981 len = PyUnicode_GET_LENGTH(str1);
10982 if (PyUnicode_GET_LENGTH(str2) != len)
10983 return 0;
10984 kind = PyUnicode_KIND(str1);
10985 if (PyUnicode_KIND(str2) != kind)
10986 return 0;
10987 data1 = PyUnicode_DATA(str1);
10988 data2 = PyUnicode_DATA(str2);
10989
10990 cmp = memcmp(data1, data2, len * kind);
10991 return (cmp == 0);
10992}
10993
10994
Alexander Belopolsky40018472011-02-26 01:02:56 +000010995int
10996PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010997{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010998 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10999 if (PyUnicode_READY(left) == -1 ||
11000 PyUnicode_READY(right) == -1)
11001 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011002
11003 /* a string is equal to itself */
11004 if (left == right)
11005 return 0;
11006
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011007 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011008 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011009 PyErr_Format(PyExc_TypeError,
11010 "Can't compare %.100s and %.100s",
11011 left->ob_type->tp_name,
11012 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011013 return -1;
11014}
11015
Martin v. Löwis5b222132007-06-10 09:51:05 +000011016int
11017PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011019 Py_ssize_t i;
11020 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011022 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023
Victor Stinner910337b2011-10-03 03:20:16 +020011024 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011025 if (!PyUnicode_IS_READY(uni)) {
11026 const wchar_t *ws = _PyUnicode_WSTR(uni);
11027 /* Compare Unicode string and source character set string */
11028 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11029 if (chr != ustr[i])
11030 return (chr < ustr[i]) ? -1 : 1;
11031 }
11032 /* This check keeps Python strings that end in '\0' from comparing equal
11033 to C strings identical up to that point. */
11034 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11035 return 1; /* uni is longer */
11036 if (ustr[i])
11037 return -1; /* str is longer */
11038 return 0;
11039 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011040 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011041 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011042 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011043 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011044 size_t len, len2 = strlen(str);
11045 int cmp;
11046
11047 len = Py_MIN(len1, len2);
11048 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011049 if (cmp != 0) {
11050 if (cmp < 0)
11051 return -1;
11052 else
11053 return 1;
11054 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011055 if (len1 > len2)
11056 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011057 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011058 return -1; /* str is longer */
11059 return 0;
11060 }
11061 else {
11062 void *data = PyUnicode_DATA(uni);
11063 /* Compare Unicode string and source character set string */
11064 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011065 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011066 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11067 /* This check keeps Python strings that end in '\0' from comparing equal
11068 to C strings identical up to that point. */
11069 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11070 return 1; /* uni is longer */
11071 if (str[i])
11072 return -1; /* str is longer */
11073 return 0;
11074 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011075}
11076
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011077static int
11078non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11079{
11080 size_t i, len;
11081 const wchar_t *p;
11082 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11083 if (strlen(str) != len)
11084 return 0;
11085 p = _PyUnicode_WSTR(unicode);
11086 assert(p);
11087 for (i = 0; i < len; i++) {
11088 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011089 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011090 return 0;
11091 }
11092 return 1;
11093}
11094
11095int
11096_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11097{
11098 size_t len;
11099 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011100 assert(str);
11101#ifndef NDEBUG
11102 for (const char *p = str; *p; p++) {
11103 assert((unsigned char)*p < 128);
11104 }
11105#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011106 if (PyUnicode_READY(unicode) == -1) {
11107 /* Memory error or bad data */
11108 PyErr_Clear();
11109 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11110 }
11111 if (!PyUnicode_IS_ASCII(unicode))
11112 return 0;
11113 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11114 return strlen(str) == len &&
11115 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11116}
11117
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011118int
11119_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11120{
11121 PyObject *right_uni;
11122 Py_hash_t hash;
11123
11124 assert(_PyUnicode_CHECK(left));
11125 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011126#ifndef NDEBUG
11127 for (const char *p = right->string; *p; p++) {
11128 assert((unsigned char)*p < 128);
11129 }
11130#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011131
11132 if (PyUnicode_READY(left) == -1) {
11133 /* memory error or bad data */
11134 PyErr_Clear();
11135 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11136 }
11137
11138 if (!PyUnicode_IS_ASCII(left))
11139 return 0;
11140
11141 right_uni = _PyUnicode_FromId(right); /* borrowed */
11142 if (right_uni == NULL) {
11143 /* memory error or bad data */
11144 PyErr_Clear();
11145 return _PyUnicode_EqualToASCIIString(left, right->string);
11146 }
11147
11148 if (left == right_uni)
11149 return 1;
11150
11151 if (PyUnicode_CHECK_INTERNED(left))
11152 return 0;
11153
INADA Naoki7cc95f52018-01-28 02:07:09 +090011154 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011155 hash = _PyUnicode_HASH(left);
11156 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11157 return 0;
11158
11159 return unicode_compare_eq(left, right_uni);
11160}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011161
Alexander Belopolsky40018472011-02-26 01:02:56 +000011162PyObject *
11163PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011164{
11165 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011166
Victor Stinnere5567ad2012-10-23 02:48:49 +020011167 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11168 Py_RETURN_NOTIMPLEMENTED;
11169
11170 if (PyUnicode_READY(left) == -1 ||
11171 PyUnicode_READY(right) == -1)
11172 return NULL;
11173
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011174 if (left == right) {
11175 switch (op) {
11176 case Py_EQ:
11177 case Py_LE:
11178 case Py_GE:
11179 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011180 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011181 case Py_NE:
11182 case Py_LT:
11183 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011184 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011185 default:
11186 PyErr_BadArgument();
11187 return NULL;
11188 }
11189 }
11190 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011191 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011192 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011193 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011194 }
11195 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011196 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011197 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011198 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011199}
11200
Alexander Belopolsky40018472011-02-26 01:02:56 +000011201int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011202_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11203{
11204 return unicode_eq(aa, bb);
11205}
11206
11207int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011208PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011209{
Victor Stinner77282cb2013-04-14 19:22:47 +020011210 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011211 void *buf1, *buf2;
11212 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011213 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011214
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011215 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011216 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011217 "'in <string>' requires string as left operand, not %.100s",
11218 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011219 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011220 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011221 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011222 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 if (ensure_unicode(str) < 0)
11224 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011225
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011226 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011227 kind2 = PyUnicode_KIND(substr);
11228 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011229 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011230 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011231 len2 = PyUnicode_GET_LENGTH(substr);
11232 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011233 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011234 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011235 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011236 if (len2 == 1) {
11237 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11238 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011239 return result;
11240 }
11241 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011242 buf2 = _PyUnicode_AsKind(substr, kind1);
11243 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011244 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011245 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011246
Victor Stinner77282cb2013-04-14 19:22:47 +020011247 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248 case PyUnicode_1BYTE_KIND:
11249 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11250 break;
11251 case PyUnicode_2BYTE_KIND:
11252 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11253 break;
11254 case PyUnicode_4BYTE_KIND:
11255 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11256 break;
11257 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011258 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011259 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011260
Victor Stinner77282cb2013-04-14 19:22:47 +020011261 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011262 PyMem_Free(buf2);
11263
Guido van Rossum403d68b2000-03-13 15:55:09 +000011264 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011265}
11266
Guido van Rossumd57fd912000-03-10 22:53:23 +000011267/* Concat to string or Unicode object giving a new Unicode object. */
11268
Alexander Belopolsky40018472011-02-26 01:02:56 +000011269PyObject *
11270PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011271{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011272 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011273 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011274 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011275
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011276 if (ensure_unicode(left) < 0)
11277 return NULL;
11278
11279 if (!PyUnicode_Check(right)) {
11280 PyErr_Format(PyExc_TypeError,
11281 "can only concatenate str (not \"%.200s\") to str",
11282 right->ob_type->tp_name);
11283 return NULL;
11284 }
11285 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011286 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011287
11288 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011289 if (left == unicode_empty)
11290 return PyUnicode_FromObject(right);
11291 if (right == unicode_empty)
11292 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011293
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011294 left_len = PyUnicode_GET_LENGTH(left);
11295 right_len = PyUnicode_GET_LENGTH(right);
11296 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011297 PyErr_SetString(PyExc_OverflowError,
11298 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011299 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011300 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011302
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11304 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011305 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011306
Guido van Rossumd57fd912000-03-10 22:53:23 +000011307 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011308 result = PyUnicode_New(new_len, maxchar);
11309 if (result == NULL)
11310 return NULL;
11311 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11312 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11313 assert(_PyUnicode_CheckConsistency(result, 1));
11314 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011315}
11316
Walter Dörwald1ab83302007-05-18 17:15:44 +000011317void
Victor Stinner23e56682011-10-03 03:54:37 +020011318PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011319{
Victor Stinner23e56682011-10-03 03:54:37 +020011320 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011321 Py_UCS4 maxchar, maxchar2;
11322 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011323
11324 if (p_left == NULL) {
11325 if (!PyErr_Occurred())
11326 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011327 return;
11328 }
Victor Stinner23e56682011-10-03 03:54:37 +020011329 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011330 if (right == NULL || left == NULL
11331 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011332 if (!PyErr_Occurred())
11333 PyErr_BadInternalCall();
11334 goto error;
11335 }
11336
Benjamin Petersonbac79492012-01-14 13:34:47 -050011337 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011338 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011339 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011340 goto error;
11341
Victor Stinner488fa492011-12-12 00:01:39 +010011342 /* Shortcuts */
11343 if (left == unicode_empty) {
11344 Py_DECREF(left);
11345 Py_INCREF(right);
11346 *p_left = right;
11347 return;
11348 }
11349 if (right == unicode_empty)
11350 return;
11351
11352 left_len = PyUnicode_GET_LENGTH(left);
11353 right_len = PyUnicode_GET_LENGTH(right);
11354 if (left_len > PY_SSIZE_T_MAX - right_len) {
11355 PyErr_SetString(PyExc_OverflowError,
11356 "strings are too large to concat");
11357 goto error;
11358 }
11359 new_len = left_len + right_len;
11360
11361 if (unicode_modifiable(left)
11362 && PyUnicode_CheckExact(right)
11363 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011364 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11365 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011366 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011367 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011368 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11369 {
11370 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011371 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011372 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011373
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011374 /* copy 'right' into the newly allocated area of 'left' */
11375 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011376 }
Victor Stinner488fa492011-12-12 00:01:39 +010011377 else {
11378 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11379 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011380 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011381
Victor Stinner488fa492011-12-12 00:01:39 +010011382 /* Concat the two Unicode strings */
11383 res = PyUnicode_New(new_len, maxchar);
11384 if (res == NULL)
11385 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011386 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11387 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011388 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011389 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011390 }
11391 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011392 return;
11393
11394error:
Victor Stinner488fa492011-12-12 00:01:39 +010011395 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011396}
11397
11398void
11399PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11400{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011401 PyUnicode_Append(pleft, right);
11402 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011403}
11404
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011405/*
11406Wraps stringlib_parse_args_finds() and additionally ensures that the
11407first argument is a unicode object.
11408*/
11409
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011410static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011411parse_args_finds_unicode(const char * function_name, PyObject *args,
11412 PyObject **substring,
11413 Py_ssize_t *start, Py_ssize_t *end)
11414{
11415 if(stringlib_parse_args_finds(function_name, args, substring,
11416 start, end)) {
11417 if (ensure_unicode(*substring) < 0)
11418 return 0;
11419 return 1;
11420 }
11421 return 0;
11422}
11423
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011424PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011425 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011426\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011427Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011428string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011429interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011430
11431static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011432unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011434 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011435 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011436 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011437 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011438 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011439 void *buf1, *buf2;
11440 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011442 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011443 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011444
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011445 kind1 = PyUnicode_KIND(self);
11446 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011447 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011448 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011450 len1 = PyUnicode_GET_LENGTH(self);
11451 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011453 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011454 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 buf1 = PyUnicode_DATA(self);
11457 buf2 = PyUnicode_DATA(substring);
11458 if (kind2 != kind1) {
11459 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011460 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011461 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011462 }
11463 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011464 case PyUnicode_1BYTE_KIND:
11465 iresult = ucs1lib_count(
11466 ((Py_UCS1*)buf1) + start, end - start,
11467 buf2, len2, PY_SSIZE_T_MAX
11468 );
11469 break;
11470 case PyUnicode_2BYTE_KIND:
11471 iresult = ucs2lib_count(
11472 ((Py_UCS2*)buf1) + start, end - start,
11473 buf2, len2, PY_SSIZE_T_MAX
11474 );
11475 break;
11476 case PyUnicode_4BYTE_KIND:
11477 iresult = ucs4lib_count(
11478 ((Py_UCS4*)buf1) + start, end - start,
11479 buf2, len2, PY_SSIZE_T_MAX
11480 );
11481 break;
11482 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011483 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011484 }
11485
11486 result = PyLong_FromSsize_t(iresult);
11487
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011488 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011489 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011490
Guido van Rossumd57fd912000-03-10 22:53:23 +000011491 return result;
11492}
11493
INADA Naoki3ae20562017-01-16 20:41:20 +090011494/*[clinic input]
11495str.encode as unicode_encode
11496
11497 encoding: str(c_default="NULL") = 'utf-8'
11498 The encoding in which to encode the string.
11499 errors: str(c_default="NULL") = 'strict'
11500 The error handling scheme to use for encoding errors.
11501 The default is 'strict' meaning that encoding errors raise a
11502 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11503 'xmlcharrefreplace' as well as any other name registered with
11504 codecs.register_error that can handle UnicodeEncodeErrors.
11505
11506Encode the string using the codec registered for encoding.
11507[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011508
11509static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011510unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011511/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011512{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011513 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011514}
11515
INADA Naoki3ae20562017-01-16 20:41:20 +090011516/*[clinic input]
11517str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011518
INADA Naoki3ae20562017-01-16 20:41:20 +090011519 tabsize: int = 8
11520
11521Return a copy where all tab characters are expanded using spaces.
11522
11523If tabsize is not given, a tab size of 8 characters is assumed.
11524[clinic start generated code]*/
11525
11526static PyObject *
11527unicode_expandtabs_impl(PyObject *self, int tabsize)
11528/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011529{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011530 Py_ssize_t i, j, line_pos, src_len, incr;
11531 Py_UCS4 ch;
11532 PyObject *u;
11533 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011534 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011535 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011536
Antoine Pitrou22425222011-10-04 19:10:51 +020011537 if (PyUnicode_READY(self) == -1)
11538 return NULL;
11539
Thomas Wouters7e474022000-07-16 12:04:32 +000011540 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011541 src_len = PyUnicode_GET_LENGTH(self);
11542 i = j = line_pos = 0;
11543 kind = PyUnicode_KIND(self);
11544 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011545 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011546 for (; i < src_len; i++) {
11547 ch = PyUnicode_READ(kind, src_data, i);
11548 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011549 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011550 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011551 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 goto overflow;
11554 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011555 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011556 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011558 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011560 goto overflow;
11561 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011562 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011563 if (ch == '\n' || ch == '\r')
11564 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011565 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011566 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011567 if (!found)
11568 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011569
Guido van Rossumd57fd912000-03-10 22:53:23 +000011570 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011571 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 if (!u)
11573 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011574 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011575
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 for (; i < src_len; i++) {
11579 ch = PyUnicode_READ(kind, src_data, i);
11580 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011581 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011582 incr = tabsize - (line_pos % tabsize);
11583 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011584 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011585 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011586 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011587 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011589 line_pos++;
11590 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011591 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011592 if (ch == '\n' || ch == '\r')
11593 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011594 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011595 }
11596 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011597 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011598
Antoine Pitroue71d5742011-10-04 15:55:09 +020011599 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011600 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11601 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011602}
11603
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011604PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011605 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011606\n\
11607Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011608such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011609arguments start and end are interpreted as in slice notation.\n\
11610\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011611Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011612
11613static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011614unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011615{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011616 /* initialize variables to prevent gcc warning */
11617 PyObject *substring = NULL;
11618 Py_ssize_t start = 0;
11619 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011620 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011621
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011622 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011624
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011625 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011626 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011627
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011628 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011629
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011630 if (result == -2)
11631 return NULL;
11632
Christian Heimes217cfd12007-12-02 14:31:20 +000011633 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634}
11635
11636static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011637unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011639 void *data;
11640 enum PyUnicode_Kind kind;
11641 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011642
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011643 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011645 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011646 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011647 if (PyUnicode_READY(self) == -1) {
11648 return NULL;
11649 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011650 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11651 PyErr_SetString(PyExc_IndexError, "string index out of range");
11652 return NULL;
11653 }
11654 kind = PyUnicode_KIND(self);
11655 data = PyUnicode_DATA(self);
11656 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011657 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011658}
11659
Guido van Rossumc2504932007-09-18 19:42:40 +000011660/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011661 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011662static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011663unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011664{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011665 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011666
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011667#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011668 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011669#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011670 if (_PyUnicode_HASH(self) != -1)
11671 return _PyUnicode_HASH(self);
11672 if (PyUnicode_READY(self) == -1)
11673 return -1;
animalizea1d14252019-01-02 20:16:06 +080011674
Christian Heimes985ecdc2013-11-20 11:46:18 +010011675 x = _Py_HashBytes(PyUnicode_DATA(self),
11676 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011677 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011678 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011679}
11680
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011681PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011682 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011683\n\
oldkaa0735f2018-02-02 16:52:55 +080011684Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011685such that sub is contained within S[start:end]. Optional\n\
11686arguments start and end are interpreted as in slice notation.\n\
11687\n\
11688Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011689
11690static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011691unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011692{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011693 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011694 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011695 PyObject *substring = NULL;
11696 Py_ssize_t start = 0;
11697 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011699 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011702 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011703 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011704
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011705 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011706
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011707 if (result == -2)
11708 return NULL;
11709
Guido van Rossumd57fd912000-03-10 22:53:23 +000011710 if (result < 0) {
11711 PyErr_SetString(PyExc_ValueError, "substring not found");
11712 return NULL;
11713 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011714
Christian Heimes217cfd12007-12-02 14:31:20 +000011715 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011716}
11717
INADA Naoki3ae20562017-01-16 20:41:20 +090011718/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011719str.isascii as unicode_isascii
11720
11721Return True if all characters in the string are ASCII, False otherwise.
11722
11723ASCII characters have code points in the range U+0000-U+007F.
11724Empty string is ASCII too.
11725[clinic start generated code]*/
11726
11727static PyObject *
11728unicode_isascii_impl(PyObject *self)
11729/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11730{
11731 if (PyUnicode_READY(self) == -1) {
11732 return NULL;
11733 }
11734 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11735}
11736
11737/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011738str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011739
INADA Naoki3ae20562017-01-16 20:41:20 +090011740Return True if the string is a lowercase string, False otherwise.
11741
11742A string is lowercase if all cased characters in the string are lowercase and
11743there is at least one cased character in the string.
11744[clinic start generated code]*/
11745
11746static PyObject *
11747unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011748/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011750 Py_ssize_t i, length;
11751 int kind;
11752 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011753 int cased;
11754
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011755 if (PyUnicode_READY(self) == -1)
11756 return NULL;
11757 length = PyUnicode_GET_LENGTH(self);
11758 kind = PyUnicode_KIND(self);
11759 data = PyUnicode_DATA(self);
11760
Guido van Rossumd57fd912000-03-10 22:53:23 +000011761 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011762 if (length == 1)
11763 return PyBool_FromLong(
11764 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011765
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011766 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011767 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011768 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011769
Guido van Rossumd57fd912000-03-10 22:53:23 +000011770 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 for (i = 0; i < length; i++) {
11772 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011773
Benjamin Peterson29060642009-01-31 22:14:21 +000011774 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011775 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 else if (!cased && Py_UNICODE_ISLOWER(ch))
11777 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011778 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011779 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780}
11781
INADA Naoki3ae20562017-01-16 20:41:20 +090011782/*[clinic input]
11783str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784
INADA Naoki3ae20562017-01-16 20:41:20 +090011785Return True if the string is an uppercase string, False otherwise.
11786
11787A string is uppercase if all cased characters in the string are uppercase and
11788there is at least one cased character in the string.
11789[clinic start generated code]*/
11790
11791static PyObject *
11792unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011793/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011794{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011795 Py_ssize_t i, length;
11796 int kind;
11797 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011798 int cased;
11799
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011800 if (PyUnicode_READY(self) == -1)
11801 return NULL;
11802 length = PyUnicode_GET_LENGTH(self);
11803 kind = PyUnicode_KIND(self);
11804 data = PyUnicode_DATA(self);
11805
Guido van Rossumd57fd912000-03-10 22:53:23 +000011806 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011807 if (length == 1)
11808 return PyBool_FromLong(
11809 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011810
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011811 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011812 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011813 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011814
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011816 for (i = 0; i < length; i++) {
11817 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011818
Benjamin Peterson29060642009-01-31 22:14:21 +000011819 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011820 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 else if (!cased && Py_UNICODE_ISUPPER(ch))
11822 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011823 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011824 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825}
11826
INADA Naoki3ae20562017-01-16 20:41:20 +090011827/*[clinic input]
11828str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011829
INADA Naoki3ae20562017-01-16 20:41:20 +090011830Return True if the string is a title-cased string, False otherwise.
11831
11832In a title-cased string, upper- and title-case characters may only
11833follow uncased characters and lowercase characters only cased ones.
11834[clinic start generated code]*/
11835
11836static PyObject *
11837unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011838/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011839{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011840 Py_ssize_t i, length;
11841 int kind;
11842 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011843 int cased, previous_is_cased;
11844
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011845 if (PyUnicode_READY(self) == -1)
11846 return NULL;
11847 length = PyUnicode_GET_LENGTH(self);
11848 kind = PyUnicode_KIND(self);
11849 data = PyUnicode_DATA(self);
11850
Guido van Rossumd57fd912000-03-10 22:53:23 +000011851 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011852 if (length == 1) {
11853 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11854 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11855 (Py_UNICODE_ISUPPER(ch) != 0));
11856 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011857
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011858 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011859 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011860 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011861
Guido van Rossumd57fd912000-03-10 22:53:23 +000011862 cased = 0;
11863 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011864 for (i = 0; i < length; i++) {
11865 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011866
Benjamin Peterson29060642009-01-31 22:14:21 +000011867 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11868 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011869 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011870 previous_is_cased = 1;
11871 cased = 1;
11872 }
11873 else if (Py_UNICODE_ISLOWER(ch)) {
11874 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011875 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011876 previous_is_cased = 1;
11877 cased = 1;
11878 }
11879 else
11880 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011881 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011882 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883}
11884
INADA Naoki3ae20562017-01-16 20:41:20 +090011885/*[clinic input]
11886str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887
INADA Naoki3ae20562017-01-16 20:41:20 +090011888Return True if the string is a whitespace string, False otherwise.
11889
11890A string is whitespace if all characters in the string are whitespace and there
11891is at least one character in the string.
11892[clinic start generated code]*/
11893
11894static PyObject *
11895unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011896/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011897{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011898 Py_ssize_t i, length;
11899 int kind;
11900 void *data;
11901
11902 if (PyUnicode_READY(self) == -1)
11903 return NULL;
11904 length = PyUnicode_GET_LENGTH(self);
11905 kind = PyUnicode_KIND(self);
11906 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011907
Guido van Rossumd57fd912000-03-10 22:53:23 +000011908 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011909 if (length == 1)
11910 return PyBool_FromLong(
11911 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011912
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011913 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011914 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011915 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011916
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011917 for (i = 0; i < length; i++) {
11918 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011919 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011920 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011921 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011922 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923}
11924
INADA Naoki3ae20562017-01-16 20:41:20 +090011925/*[clinic input]
11926str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011927
INADA Naoki3ae20562017-01-16 20:41:20 +090011928Return True if the string is an alphabetic string, False otherwise.
11929
11930A string is alphabetic if all characters in the string are alphabetic and there
11931is at least one character in the string.
11932[clinic start generated code]*/
11933
11934static PyObject *
11935unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011936/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011937{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 Py_ssize_t i, length;
11939 int kind;
11940 void *data;
11941
11942 if (PyUnicode_READY(self) == -1)
11943 return NULL;
11944 length = PyUnicode_GET_LENGTH(self);
11945 kind = PyUnicode_KIND(self);
11946 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011947
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011948 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011949 if (length == 1)
11950 return PyBool_FromLong(
11951 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011952
11953 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011954 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011955 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011956
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011957 for (i = 0; i < length; i++) {
11958 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011959 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011960 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011961 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962}
11963
INADA Naoki3ae20562017-01-16 20:41:20 +090011964/*[clinic input]
11965str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011966
INADA Naoki3ae20562017-01-16 20:41:20 +090011967Return True if the string is an alpha-numeric string, False otherwise.
11968
11969A string is alpha-numeric if all characters in the string are alpha-numeric and
11970there is at least one character in the string.
11971[clinic start generated code]*/
11972
11973static PyObject *
11974unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011975/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011976{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011977 int kind;
11978 void *data;
11979 Py_ssize_t len, i;
11980
11981 if (PyUnicode_READY(self) == -1)
11982 return NULL;
11983
11984 kind = PyUnicode_KIND(self);
11985 data = PyUnicode_DATA(self);
11986 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011987
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011988 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011989 if (len == 1) {
11990 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11991 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11992 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011993
11994 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011995 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011996 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011997
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011998 for (i = 0; i < len; i++) {
11999 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012000 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012002 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004}
12005
INADA Naoki3ae20562017-01-16 20:41:20 +090012006/*[clinic input]
12007str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012008
INADA Naoki3ae20562017-01-16 20:41:20 +090012009Return True if the string is a decimal string, False otherwise.
12010
12011A string is a decimal string if all characters in the string are decimal and
12012there is at least one character in the string.
12013[clinic start generated code]*/
12014
12015static PyObject *
12016unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012017/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012018{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012019 Py_ssize_t i, length;
12020 int kind;
12021 void *data;
12022
12023 if (PyUnicode_READY(self) == -1)
12024 return NULL;
12025 length = PyUnicode_GET_LENGTH(self);
12026 kind = PyUnicode_KIND(self);
12027 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012028
Guido van Rossumd57fd912000-03-10 22:53:23 +000012029 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012030 if (length == 1)
12031 return PyBool_FromLong(
12032 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012034 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012036 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012037
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012038 for (i = 0; i < length; i++) {
12039 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012040 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012041 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043}
12044
INADA Naoki3ae20562017-01-16 20:41:20 +090012045/*[clinic input]
12046str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047
INADA Naoki3ae20562017-01-16 20:41:20 +090012048Return True if the string is a digit string, False otherwise.
12049
12050A string is a digit string if all characters in the string are digits and there
12051is at least one character in the string.
12052[clinic start generated code]*/
12053
12054static PyObject *
12055unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012056/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012058 Py_ssize_t i, length;
12059 int kind;
12060 void *data;
12061
12062 if (PyUnicode_READY(self) == -1)
12063 return NULL;
12064 length = PyUnicode_GET_LENGTH(self);
12065 kind = PyUnicode_KIND(self);
12066 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012067
Guido van Rossumd57fd912000-03-10 22:53:23 +000012068 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012069 if (length == 1) {
12070 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12071 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12072 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012073
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012074 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012076 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012077
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012078 for (i = 0; i < length; i++) {
12079 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012080 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012081 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012082 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083}
12084
INADA Naoki3ae20562017-01-16 20:41:20 +090012085/*[clinic input]
12086str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012087
INADA Naoki3ae20562017-01-16 20:41:20 +090012088Return True if the string is a numeric string, False otherwise.
12089
12090A string is numeric if all characters in the string are numeric and there is at
12091least one character in the string.
12092[clinic start generated code]*/
12093
12094static PyObject *
12095unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012096/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012097{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012098 Py_ssize_t i, length;
12099 int kind;
12100 void *data;
12101
12102 if (PyUnicode_READY(self) == -1)
12103 return NULL;
12104 length = PyUnicode_GET_LENGTH(self);
12105 kind = PyUnicode_KIND(self);
12106 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012107
Guido van Rossumd57fd912000-03-10 22:53:23 +000012108 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012109 if (length == 1)
12110 return PyBool_FromLong(
12111 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012112
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012113 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012114 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012115 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012116
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012117 for (i = 0; i < length; i++) {
12118 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012119 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012120 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012121 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122}
12123
Martin v. Löwis47383402007-08-15 07:32:56 +000012124int
12125PyUnicode_IsIdentifier(PyObject *self)
12126{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012127 int kind;
12128 void *data;
12129 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012130 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012131
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012132 if (PyUnicode_READY(self) == -1) {
12133 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012134 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012135 }
12136
12137 /* Special case for empty strings */
12138 if (PyUnicode_GET_LENGTH(self) == 0)
12139 return 0;
12140 kind = PyUnicode_KIND(self);
12141 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012142
12143 /* PEP 3131 says that the first character must be in
12144 XID_Start and subsequent characters in XID_Continue,
12145 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012146 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012147 letters, digits, underscore). However, given the current
12148 definition of XID_Start and XID_Continue, it is sufficient
12149 to check just for these, except that _ must be allowed
12150 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012151 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012152 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012153 return 0;
12154
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012155 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012157 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012158 return 1;
12159}
12160
INADA Naoki3ae20562017-01-16 20:41:20 +090012161/*[clinic input]
12162str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012163
INADA Naoki3ae20562017-01-16 20:41:20 +090012164Return True if the string is a valid Python identifier, False otherwise.
12165
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012166Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012167such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012168[clinic start generated code]*/
12169
12170static PyObject *
12171unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012172/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012173{
12174 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12175}
12176
INADA Naoki3ae20562017-01-16 20:41:20 +090012177/*[clinic input]
12178str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012179
INADA Naoki3ae20562017-01-16 20:41:20 +090012180Return True if the string is printable, False otherwise.
12181
12182A string is printable if all of its characters are considered printable in
12183repr() or if it is empty.
12184[clinic start generated code]*/
12185
12186static PyObject *
12187unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012188/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012189{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012190 Py_ssize_t i, length;
12191 int kind;
12192 void *data;
12193
12194 if (PyUnicode_READY(self) == -1)
12195 return NULL;
12196 length = PyUnicode_GET_LENGTH(self);
12197 kind = PyUnicode_KIND(self);
12198 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012199
12200 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (length == 1)
12202 return PyBool_FromLong(
12203 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012204
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012205 for (i = 0; i < length; i++) {
12206 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012207 Py_RETURN_FALSE;
12208 }
12209 }
12210 Py_RETURN_TRUE;
12211}
12212
INADA Naoki3ae20562017-01-16 20:41:20 +090012213/*[clinic input]
12214str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012215
INADA Naoki3ae20562017-01-16 20:41:20 +090012216 iterable: object
12217 /
12218
12219Concatenate any number of strings.
12220
Martin Panter91a88662017-01-24 00:30:06 +000012221The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012222The result is returned as a new string.
12223
12224Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12225[clinic start generated code]*/
12226
12227static PyObject *
12228unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012229/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012230{
INADA Naoki3ae20562017-01-16 20:41:20 +090012231 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232}
12233
Martin v. Löwis18e16552006-02-15 17:27:45 +000012234static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012235unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012236{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012237 if (PyUnicode_READY(self) == -1)
12238 return -1;
12239 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012240}
12241
INADA Naoki3ae20562017-01-16 20:41:20 +090012242/*[clinic input]
12243str.ljust as unicode_ljust
12244
12245 width: Py_ssize_t
12246 fillchar: Py_UCS4 = ' '
12247 /
12248
12249Return a left-justified string of length width.
12250
12251Padding is done using the specified fill character (default is a space).
12252[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012253
12254static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012255unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12256/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012257{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012258 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012259 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012260
Victor Stinnerc4b49542011-12-11 22:44:26 +010012261 if (PyUnicode_GET_LENGTH(self) >= width)
12262 return unicode_result_unchanged(self);
12263
12264 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012265}
12266
INADA Naoki3ae20562017-01-16 20:41:20 +090012267/*[clinic input]
12268str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012269
INADA Naoki3ae20562017-01-16 20:41:20 +090012270Return a copy of the string converted to lowercase.
12271[clinic start generated code]*/
12272
12273static PyObject *
12274unicode_lower_impl(PyObject *self)
12275/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012276{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012277 if (PyUnicode_READY(self) == -1)
12278 return NULL;
12279 if (PyUnicode_IS_ASCII(self))
12280 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012281 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012282}
12283
/* Strip modes; the values double as indexes into stripfuncnames[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
static const char *stripfuncnames[] = {
    "lstrip",   /* LEFTSTRIP */
    "rstrip",   /* RIGHTSTRIP */
    "strip"     /* BOTHSTRIP */
};

/* Method name for a given strip mode. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012292
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012293/* externally visible for str.strip(unicode) */
12294PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012295_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012296{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012297 void *data;
12298 int kind;
12299 Py_ssize_t i, j, len;
12300 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012301 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012303 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12304 return NULL;
12305
12306 kind = PyUnicode_KIND(self);
12307 data = PyUnicode_DATA(self);
12308 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012309 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012310 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12311 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012312 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012313
Benjamin Peterson14339b62009-01-31 16:36:08 +000012314 i = 0;
12315 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012316 while (i < len) {
12317 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12318 if (!BLOOM(sepmask, ch))
12319 break;
12320 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12321 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012322 i++;
12323 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012324 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012325
Benjamin Peterson14339b62009-01-31 16:36:08 +000012326 j = len;
12327 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012328 j--;
12329 while (j >= i) {
12330 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12331 if (!BLOOM(sepmask, ch))
12332 break;
12333 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12334 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012335 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012336 }
12337
Benjamin Peterson29060642009-01-31 22:14:21 +000012338 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012339 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012340
Victor Stinner7931d9a2011-11-04 00:22:48 +010012341 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012342}
12343
12344PyObject*
12345PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12346{
12347 unsigned char *data;
12348 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012349 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012350
Victor Stinnerde636f32011-10-01 03:55:54 +020012351 if (PyUnicode_READY(self) == -1)
12352 return NULL;
12353
Victor Stinner684d5fd2012-05-03 02:32:34 +020012354 length = PyUnicode_GET_LENGTH(self);
12355 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012356
Victor Stinner684d5fd2012-05-03 02:32:34 +020012357 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012358 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012359
Victor Stinnerde636f32011-10-01 03:55:54 +020012360 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012361 PyErr_SetString(PyExc_IndexError, "string index out of range");
12362 return NULL;
12363 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012364 if (start >= length || end < start)
12365 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012366
Victor Stinner684d5fd2012-05-03 02:32:34 +020012367 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012368 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012369 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012370 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012371 }
12372 else {
12373 kind = PyUnicode_KIND(self);
12374 data = PyUnicode_1BYTE_DATA(self);
12375 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012376 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012377 length);
12378 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012379}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380
12381static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012382do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012383{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012384 Py_ssize_t len, i, j;
12385
12386 if (PyUnicode_READY(self) == -1)
12387 return NULL;
12388
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012389 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012390
Victor Stinnercc7af722013-04-09 22:39:24 +020012391 if (PyUnicode_IS_ASCII(self)) {
12392 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12393
12394 i = 0;
12395 if (striptype != RIGHTSTRIP) {
12396 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012397 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012398 if (!_Py_ascii_whitespace[ch])
12399 break;
12400 i++;
12401 }
12402 }
12403
12404 j = len;
12405 if (striptype != LEFTSTRIP) {
12406 j--;
12407 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012408 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012409 if (!_Py_ascii_whitespace[ch])
12410 break;
12411 j--;
12412 }
12413 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012414 }
12415 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012416 else {
12417 int kind = PyUnicode_KIND(self);
12418 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012419
Victor Stinnercc7af722013-04-09 22:39:24 +020012420 i = 0;
12421 if (striptype != RIGHTSTRIP) {
12422 while (i < len) {
12423 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12424 if (!Py_UNICODE_ISSPACE(ch))
12425 break;
12426 i++;
12427 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012428 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012429
12430 j = len;
12431 if (striptype != LEFTSTRIP) {
12432 j--;
12433 while (j >= i) {
12434 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12435 if (!Py_UNICODE_ISSPACE(ch))
12436 break;
12437 j--;
12438 }
12439 j++;
12440 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012441 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012442
Victor Stinner7931d9a2011-11-04 00:22:48 +010012443 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012444}
12445
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012446
12447static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012448do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012449{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012450 if (sep != NULL && sep != Py_None) {
12451 if (PyUnicode_Check(sep))
12452 return _PyUnicode_XStrip(self, striptype, sep);
12453 else {
12454 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012455 "%s arg must be None or str",
12456 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012457 return NULL;
12458 }
12459 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012460
Benjamin Peterson14339b62009-01-31 16:36:08 +000012461 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462}
12463
12464
INADA Naoki3ae20562017-01-16 20:41:20 +090012465/*[clinic input]
12466str.strip as unicode_strip
12467
12468 chars: object = None
12469 /
12470
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012472
12473If chars is given and not None, remove characters in chars instead.
12474[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012475
12476static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012477unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012478/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012479{
INADA Naoki3ae20562017-01-16 20:41:20 +090012480 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481}
12482
12483
INADA Naoki3ae20562017-01-16 20:41:20 +090012484/*[clinic input]
12485str.lstrip as unicode_lstrip
12486
12487 chars: object = NULL
12488 /
12489
12490Return a copy of the string with leading whitespace removed.
12491
12492If chars is given and not None, remove characters in chars instead.
12493[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012494
12495static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012496unicode_lstrip_impl(PyObject *self, PyObject *chars)
12497/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012498{
INADA Naoki3ae20562017-01-16 20:41:20 +090012499 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500}
12501
12502
INADA Naoki3ae20562017-01-16 20:41:20 +090012503/*[clinic input]
12504str.rstrip as unicode_rstrip
12505
12506 chars: object = NULL
12507 /
12508
12509Return a copy of the string with trailing whitespace removed.
12510
12511If chars is given and not None, remove characters in chars instead.
12512[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012513
12514static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012515unicode_rstrip_impl(PyObject *self, PyObject *chars)
12516/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012517{
INADA Naoki3ae20562017-01-16 20:41:20 +090012518 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012519}
12520
12521
Guido van Rossumd57fd912000-03-10 22:53:23 +000012522static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012523unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012525 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012526 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012527
Serhiy Storchaka05997252013-01-26 12:14:02 +020012528 if (len < 1)
12529 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012530
Victor Stinnerc4b49542011-12-11 22:44:26 +010012531 /* no repeat, return original string */
12532 if (len == 1)
12533 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012534
Benjamin Petersonbac79492012-01-14 13:34:47 -050012535 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012536 return NULL;
12537
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012538 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012539 PyErr_SetString(PyExc_OverflowError,
12540 "repeated string is too long");
12541 return NULL;
12542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012543 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012544
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012545 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012546 if (!u)
12547 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012548 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012549
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012550 if (PyUnicode_GET_LENGTH(str) == 1) {
12551 const int kind = PyUnicode_KIND(str);
12552 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012553 if (kind == PyUnicode_1BYTE_KIND) {
12554 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012555 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012556 }
12557 else if (kind == PyUnicode_2BYTE_KIND) {
12558 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012559 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012560 ucs2[n] = fill_char;
12561 } else {
12562 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12563 assert(kind == PyUnicode_4BYTE_KIND);
12564 for (n = 0; n < len; ++n)
12565 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012566 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012567 }
12568 else {
12569 /* number of characters copied this far */
12570 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012571 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012572 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012573 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012575 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012577 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012578 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012579 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012580 }
12581
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012582 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012583 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
12585
Alexander Belopolsky40018472011-02-26 01:02:56 +000012586PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012587PyUnicode_Replace(PyObject *str,
12588 PyObject *substr,
12589 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012590 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012591{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012592 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12593 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012594 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012595 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012596}
12597
INADA Naoki3ae20562017-01-16 20:41:20 +090012598/*[clinic input]
12599str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012600
INADA Naoki3ae20562017-01-16 20:41:20 +090012601 old: unicode
12602 new: unicode
12603 count: Py_ssize_t = -1
12604 Maximum number of occurrences to replace.
12605 -1 (the default value) means replace all occurrences.
12606 /
12607
12608Return a copy with all occurrences of substring old replaced by new.
12609
12610If the optional argument count is given, only the first count occurrences are
12611replaced.
12612[clinic start generated code]*/
12613
12614static PyObject *
12615unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12616 Py_ssize_t count)
12617/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012618{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012619 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012620 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012621 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012622}
12623
Alexander Belopolsky40018472011-02-26 01:02:56 +000012624static PyObject *
12625unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012626{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012627 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012628 Py_ssize_t isize;
12629 Py_ssize_t osize, squote, dquote, i, o;
12630 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012631 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012632 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012635 return NULL;
12636
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012637 isize = PyUnicode_GET_LENGTH(unicode);
12638 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012639
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012640 /* Compute length of output, quote characters, and
12641 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012642 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012643 max = 127;
12644 squote = dquote = 0;
12645 ikind = PyUnicode_KIND(unicode);
12646 for (i = 0; i < isize; i++) {
12647 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012648 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012649 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012650 case '\'': squote++; break;
12651 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012652 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012653 incr = 2;
12654 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012655 default:
12656 /* Fast-path ASCII */
12657 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012658 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012659 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012660 ;
12661 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012662 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012663 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012664 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012666 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012668 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012670 if (osize > PY_SSIZE_T_MAX - incr) {
12671 PyErr_SetString(PyExc_OverflowError,
12672 "string is too long to generate repr");
12673 return NULL;
12674 }
12675 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 }
12677
12678 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012679 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012680 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012681 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 if (dquote)
12683 /* Both squote and dquote present. Use squote,
12684 and escape them */
12685 osize += squote;
12686 else
12687 quote = '"';
12688 }
Victor Stinner55c08782013-04-14 18:45:39 +020012689 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690
12691 repr = PyUnicode_New(osize, max);
12692 if (repr == NULL)
12693 return NULL;
12694 okind = PyUnicode_KIND(repr);
12695 odata = PyUnicode_DATA(repr);
12696
12697 PyUnicode_WRITE(okind, odata, 0, quote);
12698 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012699 if (unchanged) {
12700 _PyUnicode_FastCopyCharacters(repr, 1,
12701 unicode, 0,
12702 isize);
12703 }
12704 else {
12705 for (i = 0, o = 1; i < isize; i++) {
12706 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707
Victor Stinner55c08782013-04-14 18:45:39 +020012708 /* Escape quotes and backslashes */
12709 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012710 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012711 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012712 continue;
12713 }
12714
12715 /* Map special whitespace to '\t', \n', '\r' */
12716 if (ch == '\t') {
12717 PyUnicode_WRITE(okind, odata, o++, '\\');
12718 PyUnicode_WRITE(okind, odata, o++, 't');
12719 }
12720 else if (ch == '\n') {
12721 PyUnicode_WRITE(okind, odata, o++, '\\');
12722 PyUnicode_WRITE(okind, odata, o++, 'n');
12723 }
12724 else if (ch == '\r') {
12725 PyUnicode_WRITE(okind, odata, o++, '\\');
12726 PyUnicode_WRITE(okind, odata, o++, 'r');
12727 }
12728
12729 /* Map non-printable US ASCII to '\xhh' */
12730 else if (ch < ' ' || ch == 0x7F) {
12731 PyUnicode_WRITE(okind, odata, o++, '\\');
12732 PyUnicode_WRITE(okind, odata, o++, 'x');
12733 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12734 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12735 }
12736
12737 /* Copy ASCII characters as-is */
12738 else if (ch < 0x7F) {
12739 PyUnicode_WRITE(okind, odata, o++, ch);
12740 }
12741
12742 /* Non-ASCII characters */
12743 else {
12744 /* Map Unicode whitespace and control characters
12745 (categories Z* and C* except ASCII space)
12746 */
12747 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12748 PyUnicode_WRITE(okind, odata, o++, '\\');
12749 /* Map 8-bit characters to '\xhh' */
12750 if (ch <= 0xff) {
12751 PyUnicode_WRITE(okind, odata, o++, 'x');
12752 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12753 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12754 }
12755 /* Map 16-bit characters to '\uxxxx' */
12756 else if (ch <= 0xffff) {
12757 PyUnicode_WRITE(okind, odata, o++, 'u');
12758 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12759 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12762 }
12763 /* Map 21-bit characters to '\U00xxxxxx' */
12764 else {
12765 PyUnicode_WRITE(okind, odata, o++, 'U');
12766 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12767 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12774 }
12775 }
12776 /* Copy characters as-is */
12777 else {
12778 PyUnicode_WRITE(okind, odata, o++, ch);
12779 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012780 }
12781 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012782 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012784 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012785 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012786}
12787
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012788PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012789 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012790\n\
12791Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012792such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012793arguments start and end are interpreted as in slice notation.\n\
12794\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012795Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012796
12797static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012798unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012799{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012800 /* initialize variables to prevent gcc warning */
12801 PyObject *substring = NULL;
12802 Py_ssize_t start = 0;
12803 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012804 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012805
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012806 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012807 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012808
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012809 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012810 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012811
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012812 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012813
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012814 if (result == -2)
12815 return NULL;
12816
Christian Heimes217cfd12007-12-02 14:31:20 +000012817 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012818}
12819
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012820PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012821 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012822\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012823Return the highest index in S where substring sub is found,\n\
12824such that sub is contained within S[start:end]. Optional\n\
12825arguments start and end are interpreted as in slice notation.\n\
12826\n\
12827Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012828
12829static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012831{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012832 /* initialize variables to prevent gcc warning */
12833 PyObject *substring = NULL;
12834 Py_ssize_t start = 0;
12835 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012836 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012837
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012838 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012839 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012840
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012841 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012842 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012843
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012844 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012845
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012846 if (result == -2)
12847 return NULL;
12848
Guido van Rossumd57fd912000-03-10 22:53:23 +000012849 if (result < 0) {
12850 PyErr_SetString(PyExc_ValueError, "substring not found");
12851 return NULL;
12852 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012853
Christian Heimes217cfd12007-12-02 14:31:20 +000012854 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012855}
12856
INADA Naoki3ae20562017-01-16 20:41:20 +090012857/*[clinic input]
12858str.rjust as unicode_rjust
12859
12860 width: Py_ssize_t
12861 fillchar: Py_UCS4 = ' '
12862 /
12863
12864Return a right-justified string of length width.
12865
12866Padding is done using the specified fill character (default is a space).
12867[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012868
12869static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012870unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12871/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012872{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012873 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874 return NULL;
12875
Victor Stinnerc4b49542011-12-11 22:44:26 +010012876 if (PyUnicode_GET_LENGTH(self) >= width)
12877 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012878
Victor Stinnerc4b49542011-12-11 22:44:26 +010012879 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880}
12881
Alexander Belopolsky40018472011-02-26 01:02:56 +000012882PyObject *
12883PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012884{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012885 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012886 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012887
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012888 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889}
12890
INADA Naoki3ae20562017-01-16 20:41:20 +090012891/*[clinic input]
12892str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012893
INADA Naoki3ae20562017-01-16 20:41:20 +090012894 sep: object = None
        The delimiter according to which to split the string.
12896 None (the default value) means split according to any whitespace,
12897 and discard empty strings from the result.
12898 maxsplit: Py_ssize_t = -1
12899 Maximum number of splits to do.
12900 -1 (the default value) means no limit.
12901
12902Return a list of the words in the string, using sep as the delimiter string.
12903[clinic start generated code]*/
12904
12905static PyObject *
12906unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12907/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012908{
INADA Naoki3ae20562017-01-16 20:41:20 +090012909 if (sep == Py_None)
12910 return split(self, NULL, maxsplit);
12911 if (PyUnicode_Check(sep))
12912 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012913
Victor Stinner998b8062018-09-12 00:23:25 +020012914 PyErr_Format(PyExc_TypeError,
12915 "must be str or None, not %.100s",
12916 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012917 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012918}
12919
Thomas Wouters477c8d52006-05-27 19:21:47 +000012920PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012921PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012922{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012923 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012924 int kind1, kind2;
12925 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012926 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012927
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012928 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012930
Victor Stinner14f8f022011-10-05 20:58:25 +020012931 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012932 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012933 len1 = PyUnicode_GET_LENGTH(str_obj);
12934 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012935 if (kind1 < kind2 || len1 < len2) {
12936 _Py_INCREF_UNICODE_EMPTY();
12937 if (!unicode_empty)
12938 out = NULL;
12939 else {
12940 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12941 Py_DECREF(unicode_empty);
12942 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012943 return out;
12944 }
12945 buf1 = PyUnicode_DATA(str_obj);
12946 buf2 = PyUnicode_DATA(sep_obj);
12947 if (kind2 != kind1) {
12948 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12949 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012950 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012951 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012952
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012953 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012955 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12956 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12957 else
12958 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012959 break;
12960 case PyUnicode_2BYTE_KIND:
12961 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12962 break;
12963 case PyUnicode_4BYTE_KIND:
12964 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12965 break;
12966 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012967 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012968 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012969
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012970 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012971 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012972
12973 return out;
12974}
12975
12976
12977PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012978PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012979{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012980 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012981 int kind1, kind2;
12982 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012984
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012985 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012986 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012987
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012988 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012989 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012990 len1 = PyUnicode_GET_LENGTH(str_obj);
12991 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012992 if (kind1 < kind2 || len1 < len2) {
12993 _Py_INCREF_UNICODE_EMPTY();
12994 if (!unicode_empty)
12995 out = NULL;
12996 else {
12997 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12998 Py_DECREF(unicode_empty);
12999 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013000 return out;
13001 }
13002 buf1 = PyUnicode_DATA(str_obj);
13003 buf2 = PyUnicode_DATA(sep_obj);
13004 if (kind2 != kind1) {
13005 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13006 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013007 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013008 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013009
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013012 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13013 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13014 else
13015 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013016 break;
13017 case PyUnicode_2BYTE_KIND:
13018 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13019 break;
13020 case PyUnicode_4BYTE_KIND:
13021 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13022 break;
13023 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013024 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013025 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013026
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013027 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013028 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013029
13030 return out;
13031}
13032
INADA Naoki3ae20562017-01-16 20:41:20 +090013033/*[clinic input]
13034str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013035
INADA Naoki3ae20562017-01-16 20:41:20 +090013036 sep: object
13037 /
13038
13039Partition the string into three parts using the given separator.
13040
13041This will search for the separator in the string. If the separator is found,
13042returns a 3-tuple containing the part before the separator, the separator
13043itself, and the part after it.
13044
13045If the separator is not found, returns a 3-tuple containing the original string
13046and two empty strings.
13047[clinic start generated code]*/
13048
13049static PyObject *
13050unicode_partition(PyObject *self, PyObject *sep)
13051/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013052{
INADA Naoki3ae20562017-01-16 20:41:20 +090013053 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054}
13055
INADA Naoki3ae20562017-01-16 20:41:20 +090013056/*[clinic input]
13057str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013058
INADA Naoki3ae20562017-01-16 20:41:20 +090013059Partition the string into three parts using the given separator.
13060
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013061This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013062the separator is found, returns a 3-tuple containing the part before the
13063separator, the separator itself, and the part after it.
13064
13065If the separator is not found, returns a 3-tuple containing two empty strings
13066and the original string.
13067[clinic start generated code]*/
13068
13069static PyObject *
13070unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013071/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013072{
INADA Naoki3ae20562017-01-16 20:41:20 +090013073 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013074}
13075
Alexander Belopolsky40018472011-02-26 01:02:56 +000013076PyObject *
13077PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013078{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013079 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013080 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013081
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013082 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013083}
13084
INADA Naoki3ae20562017-01-16 20:41:20 +090013085/*[clinic input]
13086str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013087
INADA Naoki3ae20562017-01-16 20:41:20 +090013088Return a list of the words in the string, using sep as the delimiter string.
13089
13090Splits are done starting at the end of the string and working to the front.
13091[clinic start generated code]*/
13092
13093static PyObject *
13094unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13095/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013096{
INADA Naoki3ae20562017-01-16 20:41:20 +090013097 if (sep == Py_None)
13098 return rsplit(self, NULL, maxsplit);
13099 if (PyUnicode_Check(sep))
13100 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013101
Victor Stinner998b8062018-09-12 00:23:25 +020013102 PyErr_Format(PyExc_TypeError,
13103 "must be str or None, not %.100s",
13104 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013105 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013106}
13107
INADA Naoki3ae20562017-01-16 20:41:20 +090013108/*[clinic input]
13109str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013110
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013111 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013112
13113Return a list of the lines in the string, breaking at line boundaries.
13114
13115Line breaks are not included in the resulting list unless keepends is given and
13116true.
13117[clinic start generated code]*/
13118
13119static PyObject *
13120unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013121/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013122{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013123 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124}
13125
13126static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013127PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013128{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013129 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130}
13131
INADA Naoki3ae20562017-01-16 20:41:20 +090013132/*[clinic input]
13133str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013134
INADA Naoki3ae20562017-01-16 20:41:20 +090013135Convert uppercase characters to lowercase and lowercase characters to uppercase.
13136[clinic start generated code]*/
13137
13138static PyObject *
13139unicode_swapcase_impl(PyObject *self)
13140/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013141{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013142 if (PyUnicode_READY(self) == -1)
13143 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013144 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013145}
13146
Larry Hastings61272b72014-01-07 12:41:53 -080013147/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013148
Larry Hastings31826802013-10-19 00:09:25 -070013149@staticmethod
13150str.maketrans as unicode_maketrans
13151
13152 x: object
13153
13154 y: unicode=NULL
13155
13156 z: unicode=NULL
13157
13158 /
13159
13160Return a translation table usable for str.translate().
13161
13162If there is only one argument, it must be a dictionary mapping Unicode
13163ordinals (integers) or characters to Unicode ordinals, strings or None.
13164Character keys will be then converted to ordinals.
13165If there are two arguments, they must be strings of equal length, and
13166in the resulting dictionary, each character in x will be mapped to the
13167character at the same position in y. If there is a third argument, it
13168must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013169[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013170
Larry Hastings31826802013-10-19 00:09:25 -070013171static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013172unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013173/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013174{
Georg Brandlceee0772007-11-27 23:48:05 +000013175 PyObject *new = NULL, *key, *value;
13176 Py_ssize_t i = 0;
13177 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013178
Georg Brandlceee0772007-11-27 23:48:05 +000013179 new = PyDict_New();
13180 if (!new)
13181 return NULL;
13182 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013183 int x_kind, y_kind, z_kind;
13184 void *x_data, *y_data, *z_data;
13185
Georg Brandlceee0772007-11-27 23:48:05 +000013186 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013187 if (!PyUnicode_Check(x)) {
13188 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13189 "be a string if there is a second argument");
13190 goto err;
13191 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013192 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013193 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13194 "arguments must have equal length");
13195 goto err;
13196 }
13197 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013198 x_kind = PyUnicode_KIND(x);
13199 y_kind = PyUnicode_KIND(y);
13200 x_data = PyUnicode_DATA(x);
13201 y_data = PyUnicode_DATA(y);
13202 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13203 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013204 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013205 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013206 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013207 if (!value) {
13208 Py_DECREF(key);
13209 goto err;
13210 }
Georg Brandlceee0772007-11-27 23:48:05 +000013211 res = PyDict_SetItem(new, key, value);
13212 Py_DECREF(key);
13213 Py_DECREF(value);
13214 if (res < 0)
13215 goto err;
13216 }
13217 /* create entries for deleting chars in z */
13218 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013219 z_kind = PyUnicode_KIND(z);
13220 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013221 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013222 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013223 if (!key)
13224 goto err;
13225 res = PyDict_SetItem(new, key, Py_None);
13226 Py_DECREF(key);
13227 if (res < 0)
13228 goto err;
13229 }
13230 }
13231 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013232 int kind;
13233 void *data;
13234
Georg Brandlceee0772007-11-27 23:48:05 +000013235 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013236 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013237 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13238 "to maketrans it must be a dict");
13239 goto err;
13240 }
13241 /* copy entries into the new dict, converting string keys to int keys */
13242 while (PyDict_Next(x, &i, &key, &value)) {
13243 if (PyUnicode_Check(key)) {
13244 /* convert string keys to integer keys */
13245 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013246 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013247 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13248 "table must be of length 1");
13249 goto err;
13250 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013251 kind = PyUnicode_KIND(key);
13252 data = PyUnicode_DATA(key);
13253 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013254 if (!newkey)
13255 goto err;
13256 res = PyDict_SetItem(new, newkey, value);
13257 Py_DECREF(newkey);
13258 if (res < 0)
13259 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013260 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013261 /* just keep integer keys */
13262 if (PyDict_SetItem(new, key, value) < 0)
13263 goto err;
13264 } else {
13265 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13266 "be strings or integers");
13267 goto err;
13268 }
13269 }
13270 }
13271 return new;
13272 err:
13273 Py_DECREF(new);
13274 return NULL;
13275}
13276
INADA Naoki3ae20562017-01-16 20:41:20 +090013277/*[clinic input]
13278str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013279
INADA Naoki3ae20562017-01-16 20:41:20 +090013280 table: object
13281 Translation table, which must be a mapping of Unicode ordinals to
13282 Unicode ordinals, strings, or None.
13283 /
13284
13285Replace each character in the string using the given translation table.
13286
13287The table must implement lookup/indexing via __getitem__, for instance a
13288dictionary or list. If this operation raises LookupError, the character is
13289left untouched. Characters mapped to None are deleted.
13290[clinic start generated code]*/
13291
13292static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013293unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013294/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013295{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013296 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297}
13298
INADA Naoki3ae20562017-01-16 20:41:20 +090013299/*[clinic input]
13300str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013301
INADA Naoki3ae20562017-01-16 20:41:20 +090013302Return a copy of the string converted to uppercase.
13303[clinic start generated code]*/
13304
13305static PyObject *
13306unicode_upper_impl(PyObject *self)
13307/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013308{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013309 if (PyUnicode_READY(self) == -1)
13310 return NULL;
13311 if (PyUnicode_IS_ASCII(self))
13312 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013313 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013314}
13315
INADA Naoki3ae20562017-01-16 20:41:20 +090013316/*[clinic input]
13317str.zfill as unicode_zfill
13318
13319 width: Py_ssize_t
13320 /
13321
13322Pad a numeric string with zeros on the left, to fill a field of the given width.
13323
13324The string is never truncated.
13325[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013326
13327static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013328unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013329/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013330{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013331 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013332 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013333 int kind;
13334 void *data;
13335 Py_UCS4 chr;
13336
Benjamin Petersonbac79492012-01-14 13:34:47 -050013337 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013338 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013339
Victor Stinnerc4b49542011-12-11 22:44:26 +010013340 if (PyUnicode_GET_LENGTH(self) >= width)
13341 return unicode_result_unchanged(self);
13342
13343 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013344
13345 u = pad(self, fill, 0, '0');
13346
Walter Dörwald068325e2002-04-15 13:36:47 +000013347 if (u == NULL)
13348 return NULL;
13349
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013350 kind = PyUnicode_KIND(u);
13351 data = PyUnicode_DATA(u);
13352 chr = PyUnicode_READ(kind, data, fill);
13353
13354 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013355 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013356 PyUnicode_WRITE(kind, data, 0, chr);
13357 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013358 }
13359
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013360 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013361 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013362}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013363
#if 0
/* Compiled out by the enclosing #if 0: forwards to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13371
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013372PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013373 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013374\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013375Return True if S starts with the specified prefix, False otherwise.\n\
13376With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013377With optional end, stop comparing S at that position.\n\
13378prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013379
13380static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013381unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013382 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013383{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013384 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013385 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013386 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013387 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013388 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013389
Jesus Ceaac451502011-04-20 17:09:23 +020013390 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013391 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013392 if (PyTuple_Check(subobj)) {
13393 Py_ssize_t i;
13394 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013395 substring = PyTuple_GET_ITEM(subobj, i);
13396 if (!PyUnicode_Check(substring)) {
13397 PyErr_Format(PyExc_TypeError,
13398 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013399 "not %.100s",
13400 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013401 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013402 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013404 if (result == -1)
13405 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013406 if (result) {
13407 Py_RETURN_TRUE;
13408 }
13409 }
13410 /* nothing matched */
13411 Py_RETURN_FALSE;
13412 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013413 if (!PyUnicode_Check(subobj)) {
13414 PyErr_Format(PyExc_TypeError,
13415 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013416 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013417 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013418 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013419 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013420 if (result == -1)
13421 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013422 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013423}
13424
13425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013426PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013427 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013428\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013429Return True if S ends with the specified suffix, False otherwise.\n\
13430With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013431With optional end, stop comparing S at that position.\n\
13432suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013433
13434static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013435unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013436 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013438 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013439 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013440 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013441 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013442 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013443
Jesus Ceaac451502011-04-20 17:09:23 +020013444 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013445 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013446 if (PyTuple_Check(subobj)) {
13447 Py_ssize_t i;
13448 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013449 substring = PyTuple_GET_ITEM(subobj, i);
13450 if (!PyUnicode_Check(substring)) {
13451 PyErr_Format(PyExc_TypeError,
13452 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013453 "not %.100s",
13454 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013455 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013456 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013457 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013458 if (result == -1)
13459 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013460 if (result) {
13461 Py_RETURN_TRUE;
13462 }
13463 }
13464 Py_RETURN_FALSE;
13465 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013466 if (!PyUnicode_Check(subobj)) {
13467 PyErr_Format(PyExc_TypeError,
13468 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013469 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013470 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013471 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013472 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013473 if (result == -1)
13474 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013475 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013476}
13477
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013478static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013479_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013480{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013481 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13482 writer->data = PyUnicode_DATA(writer->buffer);
13483
13484 if (!writer->readonly) {
13485 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013486 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013487 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013489 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13490 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13491 writer->kind = PyUnicode_WCHAR_KIND;
13492 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13493
Victor Stinner8f674cc2013-04-17 23:02:17 +020013494 /* Copy-on-write mode: set buffer size to 0 so
13495 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13496 * next write. */
13497 writer->size = 0;
13498 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013499}
13500
Victor Stinnerd3f08822012-05-29 12:57:52 +020013501void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013502_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013503{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013504 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013505
13506 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013507 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013508
13509 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13510 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13511 writer->kind = PyUnicode_WCHAR_KIND;
13512 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013513}
13514
/* Make sure the writer's buffer has room for `length` more characters
   after writer->pos and can hold characters up to `maxchar`.

   Depending on the current state this allocates the first buffer, grows
   (and possibly widens) the existing one, or only widens it.  The cached
   fields are refreshed through _PyUnicodeWriter_Update() on success.

   Returns 0 on success and -1 on failure.  PyErr_NoMemory() is raised here
   when pos + length would exceed PY_SSIZE_T_MAX; the other -1 returns
   propagate a failed PyUnicode_New() or resize_compact(). */
int
_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
                                 Py_ssize_t length, Py_UCS4 maxchar)
{
    Py_ssize_t newlen;
    PyObject *newbuffer;

    assert(maxchar <= MAX_UNICODE);

    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
    assert((maxchar > writer->maxchar && length >= 0)
           || length > 0);

    /* Guard the addition below against Py_ssize_t overflow. */
    if (length > PY_SSIZE_T_MAX - writer->pos) {
        PyErr_NoMemory();
        return -1;
    }
    newlen = writer->pos + length;

    /* Never allocate narrower than the writer's configured minimum. */
    maxchar = Py_MAX(maxchar, writer->min_char);

    if (writer->buffer == NULL) {
        /* Case 1: no buffer yet -- allocate the first one. */
        assert(!writer->readonly);
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        writer->buffer = PyUnicode_New(newlen, maxchar);
        if (writer->buffer == NULL)
            return -1;
    }
    else if (newlen > writer->size) {
        /* Case 2: the buffer is too short (size is 0 for a readonly
           buffer, see _PyUnicodeWriter_Update()). */
        if (writer->overallocate
            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
            /* overallocate to limit the number of realloc() */
            newlen += newlen / OVERALLOCATE_FACTOR;
        }
        if (newlen < writer->min_length)
            newlen = writer->min_length;

        if (maxchar > writer->maxchar || writer->readonly) {
            /* resize + widen */
            /* A fresh string is created and the pos characters written so
               far are copied over; the old buffer reference is dropped. */
            maxchar = Py_MAX(maxchar, writer->maxchar);
            newbuffer = PyUnicode_New(newlen, maxchar);
            if (newbuffer == NULL)
                return -1;
            _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                          writer->buffer, 0, writer->pos);
            Py_DECREF(writer->buffer);
            writer->readonly = 0;
        }
        else {
            /* Same character width: resize through resize_compact(). */
            newbuffer = resize_compact(writer->buffer, newlen);
            if (newbuffer == NULL)
                return -1;
        }
        writer->buffer = newbuffer;
    }
    else if (maxchar > writer->maxchar) {
        /* Case 3: long enough, but too narrow for maxchar -- reallocate at
           the same size and copy the characters written so far. */
        assert(!writer->readonly);
        newbuffer = PyUnicode_New(writer->size, maxchar);
        if (newbuffer == NULL)
            return -1;
        _PyUnicode_FastCopyCharacters(newbuffer, 0,
                                      writer->buffer, 0, writer->pos);
        Py_SETREF(writer->buffer, newbuffer);
    }
    /* Re-sync the cached maxchar/data/kind/size with the current buffer. */
    _PyUnicodeWriter_Update(writer);
    return 0;

#undef OVERALLOCATE_FACTOR
}
13591
Victor Stinnerca9381e2015-09-22 00:58:32 +020013592int
13593_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13594 enum PyUnicode_Kind kind)
13595{
13596 Py_UCS4 maxchar;
13597
13598 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13599 assert(writer->kind < kind);
13600
13601 switch (kind)
13602 {
13603 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13604 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13605 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13606 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013607 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013608 }
13609
13610 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13611}
13612
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013613static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013614_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013615{
Victor Stinner2740e462016-09-06 16:58:36 -070013616 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013617 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13618 return -1;
13619 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13620 writer->pos++;
13621 return 0;
13622}
13623
13624int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013625_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13626{
13627 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13628}
13629
13630int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013631_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13632{
13633 Py_UCS4 maxchar;
13634 Py_ssize_t len;
13635
13636 if (PyUnicode_READY(str) == -1)
13637 return -1;
13638 len = PyUnicode_GET_LENGTH(str);
13639 if (len == 0)
13640 return 0;
13641 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13642 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013643 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013644 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013645 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013646 Py_INCREF(str);
13647 writer->buffer = str;
13648 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013649 writer->pos += len;
13650 return 0;
13651 }
13652 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13653 return -1;
13654 }
13655 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13656 str, 0, len);
13657 writer->pos += len;
13658 return 0;
13659}
13660
Victor Stinnere215d962012-10-06 23:03:36 +020013661int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013662_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13663 Py_ssize_t start, Py_ssize_t end)
13664{
13665 Py_UCS4 maxchar;
13666 Py_ssize_t len;
13667
13668 if (PyUnicode_READY(str) == -1)
13669 return -1;
13670
13671 assert(0 <= start);
13672 assert(end <= PyUnicode_GET_LENGTH(str));
13673 assert(start <= end);
13674
13675 if (end == 0)
13676 return 0;
13677
13678 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13679 return _PyUnicodeWriter_WriteStr(writer, str);
13680
13681 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13682 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13683 else
13684 maxchar = writer->maxchar;
13685 len = end - start;
13686
13687 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13688 return -1;
13689
13690 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13691 str, start, len);
13692 writer->pos += len;
13693 return 0;
13694}
13695
13696int
Victor Stinner4a587072013-11-19 12:54:53 +010013697_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13698 const char *ascii, Py_ssize_t len)
13699{
13700 if (len == -1)
13701 len = strlen(ascii);
13702
13703 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13704
13705 if (writer->buffer == NULL && !writer->overallocate) {
13706 PyObject *str;
13707
13708 str = _PyUnicode_FromASCII(ascii, len);
13709 if (str == NULL)
13710 return -1;
13711
13712 writer->readonly = 1;
13713 writer->buffer = str;
13714 _PyUnicodeWriter_Update(writer);
13715 writer->pos += len;
13716 return 0;
13717 }
13718
13719 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13720 return -1;
13721
13722 switch (writer->kind)
13723 {
13724 case PyUnicode_1BYTE_KIND:
13725 {
13726 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13727 Py_UCS1 *data = writer->data;
13728
Christian Heimesf051e432016-09-13 20:22:02 +020013729 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013730 break;
13731 }
13732 case PyUnicode_2BYTE_KIND:
13733 {
13734 _PyUnicode_CONVERT_BYTES(
13735 Py_UCS1, Py_UCS2,
13736 ascii, ascii + len,
13737 (Py_UCS2 *)writer->data + writer->pos);
13738 break;
13739 }
13740 case PyUnicode_4BYTE_KIND:
13741 {
13742 _PyUnicode_CONVERT_BYTES(
13743 Py_UCS1, Py_UCS4,
13744 ascii, ascii + len,
13745 (Py_UCS4 *)writer->data + writer->pos);
13746 break;
13747 }
13748 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013749 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013750 }
13751
13752 writer->pos += len;
13753 return 0;
13754}
13755
13756int
13757_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13758 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013759{
13760 Py_UCS4 maxchar;
13761
13762 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13763 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13764 return -1;
13765 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13766 writer->pos += len;
13767 return 0;
13768}
13769
Victor Stinnerd3f08822012-05-29 12:57:52 +020013770PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013771_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013772{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013773 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013774
Victor Stinnerd3f08822012-05-29 12:57:52 +020013775 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013776 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013777 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013778 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013779
13780 str = writer->buffer;
13781 writer->buffer = NULL;
13782
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013783 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013784 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13785 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013786 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013787
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013788 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13789 PyObject *str2;
13790 str2 = resize_compact(str, writer->pos);
13791 if (str2 == NULL) {
13792 Py_DECREF(str);
13793 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013794 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013795 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013796 }
13797
Victor Stinner15a0bd32013-07-08 22:29:55 +020013798 assert(_PyUnicode_CheckConsistency(str, 1));
13799 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013800}
13801
Victor Stinnerd3f08822012-05-29 12:57:52 +020013802void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013803_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013804{
13805 Py_CLEAR(writer->buffer);
13806}
13807
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013808#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013809
13810PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013811 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013812\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013813Return a formatted version of S, using substitutions from args and kwargs.\n\
13814The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013815
Eric Smith27bbca62010-11-04 17:06:58 +000013816PyDoc_STRVAR(format_map__doc__,
13817 "S.format_map(mapping) -> str\n\
13818\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013819Return a formatted version of S, using substitutions from mapping.\n\
13820The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013821
INADA Naoki3ae20562017-01-16 20:41:20 +090013822/*[clinic input]
13823str.__format__ as unicode___format__
13824
13825 format_spec: unicode
13826 /
13827
13828Return a formatted version of the string as described by format_spec.
13829[clinic start generated code]*/
13830
Eric Smith4a7d76d2008-05-30 18:10:19 +000013831static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013832unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013833/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013834{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013835 _PyUnicodeWriter writer;
13836 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013837
Victor Stinnerd3f08822012-05-29 12:57:52 +020013838 if (PyUnicode_READY(self) == -1)
13839 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013840 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013841 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13842 self, format_spec, 0,
13843 PyUnicode_GET_LENGTH(format_spec));
13844 if (ret == -1) {
13845 _PyUnicodeWriter_Dealloc(&writer);
13846 return NULL;
13847 }
13848 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013849}
13850
INADA Naoki3ae20562017-01-16 20:41:20 +090013851/*[clinic input]
13852str.__sizeof__ as unicode_sizeof
13853
13854Return the size of the string in memory, in bytes.
13855[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013856
13857static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013858unicode_sizeof_impl(PyObject *self)
13859/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013860{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013861 Py_ssize_t size;
13862
13863 /* If it's a compact object, account for base structure +
13864 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013865 if (PyUnicode_IS_COMPACT_ASCII(self))
13866 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13867 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013868 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013869 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013870 else {
13871 /* If it is a two-block object, account for base object, and
13872 for character block if present. */
13873 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013874 if (_PyUnicode_DATA_ANY(self))
13875 size += (PyUnicode_GET_LENGTH(self) + 1) *
13876 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013877 }
13878 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013879 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013880 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13881 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13882 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13883 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013884
13885 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013886}
13887
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013888static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013889unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013890{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013891 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013892 if (!copy)
13893 return NULL;
13894 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013895}
13896
/* Method table for the str type.

   Entries written as UNICODE_*_METHODDEF are macros defined elsewhere
   (presumably generated by Argument Clinic -- confirm against the clinic
   header); the remaining entries are spelled out by hand.  The table is
   terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {
    UNICODE_ENCODE_METHODDEF
    UNICODE_REPLACE_METHODDEF
    UNICODE_SPLIT_METHODDEF
    UNICODE_RSPLIT_METHODDEF
    UNICODE_JOIN_METHODDEF
    UNICODE_CAPITALIZE_METHODDEF
    UNICODE_CASEFOLD_METHODDEF
    UNICODE_TITLE_METHODDEF
    UNICODE_CENTER_METHODDEF
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    UNICODE_EXPANDTABS_METHODDEF
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    UNICODE_PARTITION_METHODDEF
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    UNICODE_LJUST_METHODDEF
    UNICODE_LOWER_METHODDEF
    UNICODE_LSTRIP_METHODDEF
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    UNICODE_RJUST_METHODDEF
    UNICODE_RSTRIP_METHODDEF
    UNICODE_RPARTITION_METHODDEF
    UNICODE_SPLITLINES_METHODDEF
    UNICODE_STRIP_METHODDEF
    UNICODE_SWAPCASE_METHODDEF
    UNICODE_TRANSLATE_METHODDEF
    UNICODE_UPPER_METHODDEF
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    UNICODE_ISASCII_METHODDEF
    UNICODE_ISLOWER_METHODDEF
    UNICODE_ISUPPER_METHODDEF
    UNICODE_ISTITLE_METHODDEF
    UNICODE_ISSPACE_METHODDEF
    UNICODE_ISDECIMAL_METHODDEF
    UNICODE_ISDIGIT_METHODDEF
    UNICODE_ISNUMERIC_METHODDEF
    UNICODE_ISALPHA_METHODDEF
    UNICODE_ISALNUM_METHODDEF
    UNICODE_ISIDENTIFIER_METHODDEF
    UNICODE_ISPRINTABLE_METHODDEF
    UNICODE_ZFILL_METHODDEF
    /* do_string_format takes keyword arguments, hence the intermediate
       cast through void(*)(void) before the cast to PyCFunction. */
    {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    UNICODE___FORMAT___METHODDEF
    UNICODE_MAKETRANS_METHODDEF
    UNICODE_SIZEOF_METHODDEF
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13953
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013954static PyObject *
13955unicode_mod(PyObject *v, PyObject *w)
13956{
Brian Curtindfc80e32011-08-10 20:28:54 -050013957 if (!PyUnicode_Check(v))
13958 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013959 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013960}
13961
13962static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013963 0, /*nb_add*/
13964 0, /*nb_subtract*/
13965 0, /*nb_multiply*/
13966 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013967};
13968
Guido van Rossumd57fd912000-03-10 22:53:23 +000013969static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013970 (lenfunc) unicode_length, /* sq_length */
13971 PyUnicode_Concat, /* sq_concat */
13972 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13973 (ssizeargfunc) unicode_getitem, /* sq_item */
13974 0, /* sq_slice */
13975 0, /* sq_ass_item */
13976 0, /* sq_ass_slice */
13977 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013978};
13979
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013980static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013981unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013983 if (PyUnicode_READY(self) == -1)
13984 return NULL;
13985
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013986 if (PyIndex_Check(item)) {
13987 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013988 if (i == -1 && PyErr_Occurred())
13989 return NULL;
13990 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013991 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013992 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013993 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060013994 Py_ssize_t start, stop, step, slicelength, i;
13995 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013996 PyObject *result;
13997 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020013998 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020013999 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014000
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014001 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014002 return NULL;
14003 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014004 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14005 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014006
14007 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014008 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014009 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014010 slicelength == PyUnicode_GET_LENGTH(self)) {
14011 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014012 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014013 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014014 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014015 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014016 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014017 src_kind = PyUnicode_KIND(self);
14018 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014019 if (!PyUnicode_IS_ASCII(self)) {
14020 kind_limit = kind_maxchar_limit(src_kind);
14021 max_char = 0;
14022 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14023 ch = PyUnicode_READ(src_kind, src_data, cur);
14024 if (ch > max_char) {
14025 max_char = ch;
14026 if (max_char >= kind_limit)
14027 break;
14028 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014029 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014030 }
Victor Stinner55c99112011-10-13 01:17:06 +020014031 else
14032 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014033 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014034 if (result == NULL)
14035 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014036 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014037 dest_data = PyUnicode_DATA(result);
14038
14039 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014040 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14041 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014042 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014043 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014044 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014045 } else {
14046 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14047 return NULL;
14048 }
14049}
14050
14051static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014052 (lenfunc)unicode_length, /* mp_length */
14053 (binaryfunc)unicode_subscript, /* mp_subscript */
14054 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014055};
14056
Guido van Rossumd57fd912000-03-10 22:53:23 +000014057
Guido van Rossumd57fd912000-03-10 22:53:23 +000014058/* Helpers for PyUnicode_Format() */
14059
Victor Stinnera47082312012-10-04 02:19:54 +020014060struct unicode_formatter_t {
14061 PyObject *args;
14062 int args_owned;
14063 Py_ssize_t arglen, argidx;
14064 PyObject *dict;
14065
14066 enum PyUnicode_Kind fmtkind;
14067 Py_ssize_t fmtcnt, fmtpos;
14068 void *fmtdata;
14069 PyObject *fmtstr;
14070
14071 _PyUnicodeWriter writer;
14072};
14073
14074struct unicode_format_arg_t {
14075 Py_UCS4 ch;
14076 int flags;
14077 Py_ssize_t width;
14078 int prec;
14079 int sign;
14080};
14081
Guido van Rossumd57fd912000-03-10 22:53:23 +000014082static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014083unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084{
Victor Stinnera47082312012-10-04 02:19:54 +020014085 Py_ssize_t argidx = ctx->argidx;
14086
14087 if (argidx < ctx->arglen) {
14088 ctx->argidx++;
14089 if (ctx->arglen < 0)
14090 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014091 else
Victor Stinnera47082312012-10-04 02:19:54 +020014092 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014093 }
14094 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014095 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014096 return NULL;
14097}
14098
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014099/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014100
Victor Stinnera47082312012-10-04 02:19:54 +020014101/* Format a float into the writer if the writer is not NULL, or into *p_output
14102 otherwise.
14103
14104 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014105static int
Victor Stinnera47082312012-10-04 02:19:54 +020014106formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14107 PyObject **p_output,
14108 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014109{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014110 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014112 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014113 int prec;
14114 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014115
Guido van Rossumd57fd912000-03-10 22:53:23 +000014116 x = PyFloat_AsDouble(v);
14117 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014118 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014119
Victor Stinnera47082312012-10-04 02:19:54 +020014120 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014121 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014122 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014123
Victor Stinnera47082312012-10-04 02:19:54 +020014124 if (arg->flags & F_ALT)
14125 dtoa_flags = Py_DTSF_ALT;
14126 else
14127 dtoa_flags = 0;
14128 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014129 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014130 return -1;
14131 len = strlen(p);
14132 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014133 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014134 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014135 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014136 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014137 }
14138 else
14139 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014140 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014141 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014142}
14143
Victor Stinnerd0880d52012-04-27 23:40:13 +020014144/* formatlong() emulates the format codes d, u, o, x and X, and
14145 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14146 * Python's regular ints.
14147 * Return value: a new PyUnicodeObject*, or NULL if error.
14148 * The output string is of the form
14149 * "-"? ("0x" | "0X")? digit+
14150 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14151 * set in flags. The case of hex digits will be correct,
14152 * There will be at least prec digits, zero-filled on the left if
14153 * necessary to get that many.
14154 * val object to be converted
14155 * flags bitmask of format flags; only F_ALT is looked at
14156 * prec minimum number of digits; 0-fill on left if needed
14157 * type a character in [duoxX]; u acts the same as d
14158 *
14159 * CAUTION: o, x and X conversions on regular ints can never
14160 * produce a '-' sign, but can for Python's unbounded ints.
14161 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014162PyObject *
14163_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014164{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014165 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014166 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 Py_ssize_t i;
14168 int sign; /* 1 if '-', else 0 */
14169 int len; /* number of characters */
14170 Py_ssize_t llen;
14171 int numdigits; /* len == numnondigits + numdigits */
14172 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014173
Victor Stinnerd0880d52012-04-27 23:40:13 +020014174 /* Avoid exceeding SSIZE_T_MAX */
14175 if (prec > INT_MAX-3) {
14176 PyErr_SetString(PyExc_OverflowError,
14177 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014178 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014179 }
14180
14181 assert(PyLong_Check(val));
14182
14183 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014184 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014185 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014186 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014187 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014188 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014189 /* int and int subclasses should print numerically when a numeric */
14190 /* format code is used (see issue18780) */
14191 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014192 break;
14193 case 'o':
14194 numnondigits = 2;
14195 result = PyNumber_ToBase(val, 8);
14196 break;
14197 case 'x':
14198 case 'X':
14199 numnondigits = 2;
14200 result = PyNumber_ToBase(val, 16);
14201 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014202 }
14203 if (!result)
14204 return NULL;
14205
14206 assert(unicode_modifiable(result));
14207 assert(PyUnicode_IS_READY(result));
14208 assert(PyUnicode_IS_ASCII(result));
14209
14210 /* To modify the string in-place, there can only be one reference. */
14211 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014212 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014213 PyErr_BadInternalCall();
14214 return NULL;
14215 }
14216 buf = PyUnicode_DATA(result);
14217 llen = PyUnicode_GET_LENGTH(result);
14218 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014219 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014220 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014221 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014222 return NULL;
14223 }
14224 len = (int)llen;
14225 sign = buf[0] == '-';
14226 numnondigits += sign;
14227 numdigits = len - numnondigits;
14228 assert(numdigits > 0);
14229
14230 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014231 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014232 (type == 'o' || type == 'x' || type == 'X'))) {
14233 assert(buf[sign] == '0');
14234 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14235 buf[sign+1] == 'o');
14236 numnondigits -= 2;
14237 buf += 2;
14238 len -= 2;
14239 if (sign)
14240 buf[0] = '-';
14241 assert(len == numnondigits + numdigits);
14242 assert(numdigits > 0);
14243 }
14244
14245 /* Fill with leading zeroes to meet minimum width. */
14246 if (prec > numdigits) {
14247 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14248 numnondigits + prec);
14249 char *b1;
14250 if (!r1) {
14251 Py_DECREF(result);
14252 return NULL;
14253 }
14254 b1 = PyBytes_AS_STRING(r1);
14255 for (i = 0; i < numnondigits; ++i)
14256 *b1++ = *buf++;
14257 for (i = 0; i < prec - numdigits; i++)
14258 *b1++ = '0';
14259 for (i = 0; i < numdigits; i++)
14260 *b1++ = *buf++;
14261 *b1 = '\0';
14262 Py_DECREF(result);
14263 result = r1;
14264 buf = PyBytes_AS_STRING(result);
14265 len = numnondigits + prec;
14266 }
14267
14268 /* Fix up case for hex conversions. */
14269 if (type == 'X') {
14270 /* Need to convert all lower case letters to upper case.
14271 and need to convert 0x to 0X (and -0x to -0X). */
14272 for (i = 0; i < len; i++)
14273 if (buf[i] >= 'a' && buf[i] <= 'x')
14274 buf[i] -= 'a'-'A';
14275 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014276 if (!PyUnicode_Check(result)
14277 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014278 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014279 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014280 Py_DECREF(result);
14281 result = unicode;
14282 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014283 else if (len != PyUnicode_GET_LENGTH(result)) {
14284 if (PyUnicode_Resize(&result, len) < 0)
14285 Py_CLEAR(result);
14286 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014287 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014288}
14289
Ethan Furmandf3ed242014-01-05 06:50:30 -080014290/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014291 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014292 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014293 * -1 and raise an exception on error */
14294static int
Victor Stinnera47082312012-10-04 02:19:54 +020014295mainformatlong(PyObject *v,
14296 struct unicode_format_arg_t *arg,
14297 PyObject **p_output,
14298 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014299{
14300 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014301 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014302
14303 if (!PyNumber_Check(v))
14304 goto wrongtype;
14305
Ethan Furman9ab74802014-03-21 06:38:46 -070014306 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014307 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014308 if (type == 'o' || type == 'x' || type == 'X') {
14309 iobj = PyNumber_Index(v);
14310 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014311 if (PyErr_ExceptionMatches(PyExc_TypeError))
14312 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014313 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014314 }
14315 }
14316 else {
14317 iobj = PyNumber_Long(v);
14318 if (iobj == NULL ) {
14319 if (PyErr_ExceptionMatches(PyExc_TypeError))
14320 goto wrongtype;
14321 return -1;
14322 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014323 }
14324 assert(PyLong_Check(iobj));
14325 }
14326 else {
14327 iobj = v;
14328 Py_INCREF(iobj);
14329 }
14330
14331 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014332 && arg->width == -1 && arg->prec == -1
14333 && !(arg->flags & (F_SIGN | F_BLANK))
14334 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014335 {
14336 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014337 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014338 int base;
14339
Victor Stinnera47082312012-10-04 02:19:54 +020014340 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014341 {
14342 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014343 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014344 case 'd':
14345 case 'i':
14346 case 'u':
14347 base = 10;
14348 break;
14349 case 'o':
14350 base = 8;
14351 break;
14352 case 'x':
14353 case 'X':
14354 base = 16;
14355 break;
14356 }
14357
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014358 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14359 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014360 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014361 }
14362 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014363 return 1;
14364 }
14365
Ethan Furmanb95b5612015-01-23 20:05:18 -080014366 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014367 Py_DECREF(iobj);
14368 if (res == NULL)
14369 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014370 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014371 return 0;
14372
14373wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014374 switch(type)
14375 {
14376 case 'o':
14377 case 'x':
14378 case 'X':
14379 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014380 "%%%c format: an integer is required, "
14381 "not %.200s",
14382 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014383 break;
14384 default:
14385 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014386 "%%%c format: a number is required, "
14387 "not %.200s",
14388 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014389 break;
14390 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014391 return -1;
14392}
14393
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014394static Py_UCS4
14395formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014396{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014397 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014398 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014399 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014400 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014401 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014402 goto onError;
14403 }
14404 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014405 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014406 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014407 /* make sure number is a type of integer */
14408 if (!PyLong_Check(v)) {
14409 iobj = PyNumber_Index(v);
14410 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014411 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014412 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014413 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014414 Py_DECREF(iobj);
14415 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014416 else {
14417 x = PyLong_AsLong(v);
14418 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014419 if (x == -1 && PyErr_Occurred())
14420 goto onError;
14421
Victor Stinner8faf8212011-12-08 22:14:11 +010014422 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014423 PyErr_SetString(PyExc_OverflowError,
14424 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014425 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014426 }
14427
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014428 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014429 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014430
Benjamin Peterson29060642009-01-31 22:14:21 +000014431 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014432 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014434 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014435}
14436
Victor Stinnera47082312012-10-04 02:19:54 +020014437/* Parse options of an argument: flags, width, precision.
14438 Handle also "%(name)" syntax.
14439
14440 Return 0 if the argument has been formatted into arg->str.
14441 Return 1 if the argument has been written into ctx->writer,
14442 Raise an exception and return -1 on error. */
14443static int
14444unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14445 struct unicode_format_arg_t *arg)
14446{
14447#define FORMAT_READ(ctx) \
14448 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14449
14450 PyObject *v;
14451
Victor Stinnera47082312012-10-04 02:19:54 +020014452 if (arg->ch == '(') {
14453 /* Get argument value from a dictionary. Example: "%(name)s". */
14454 Py_ssize_t keystart;
14455 Py_ssize_t keylen;
14456 PyObject *key;
14457 int pcount = 1;
14458
14459 if (ctx->dict == NULL) {
14460 PyErr_SetString(PyExc_TypeError,
14461 "format requires a mapping");
14462 return -1;
14463 }
14464 ++ctx->fmtpos;
14465 --ctx->fmtcnt;
14466 keystart = ctx->fmtpos;
14467 /* Skip over balanced parentheses */
14468 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14469 arg->ch = FORMAT_READ(ctx);
14470 if (arg->ch == ')')
14471 --pcount;
14472 else if (arg->ch == '(')
14473 ++pcount;
14474 ctx->fmtpos++;
14475 }
14476 keylen = ctx->fmtpos - keystart - 1;
14477 if (ctx->fmtcnt < 0 || pcount > 0) {
14478 PyErr_SetString(PyExc_ValueError,
14479 "incomplete format key");
14480 return -1;
14481 }
14482 key = PyUnicode_Substring(ctx->fmtstr,
14483 keystart, keystart + keylen);
14484 if (key == NULL)
14485 return -1;
14486 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014487 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014488 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014489 }
14490 ctx->args = PyObject_GetItem(ctx->dict, key);
14491 Py_DECREF(key);
14492 if (ctx->args == NULL)
14493 return -1;
14494 ctx->args_owned = 1;
14495 ctx->arglen = -1;
14496 ctx->argidx = -2;
14497 }
14498
14499 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014500 while (--ctx->fmtcnt >= 0) {
14501 arg->ch = FORMAT_READ(ctx);
14502 ctx->fmtpos++;
14503 switch (arg->ch) {
14504 case '-': arg->flags |= F_LJUST; continue;
14505 case '+': arg->flags |= F_SIGN; continue;
14506 case ' ': arg->flags |= F_BLANK; continue;
14507 case '#': arg->flags |= F_ALT; continue;
14508 case '0': arg->flags |= F_ZERO; continue;
14509 }
14510 break;
14511 }
14512
14513 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014514 if (arg->ch == '*') {
14515 v = unicode_format_getnextarg(ctx);
14516 if (v == NULL)
14517 return -1;
14518 if (!PyLong_Check(v)) {
14519 PyErr_SetString(PyExc_TypeError,
14520 "* wants int");
14521 return -1;
14522 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014523 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014524 if (arg->width == -1 && PyErr_Occurred())
14525 return -1;
14526 if (arg->width < 0) {
14527 arg->flags |= F_LJUST;
14528 arg->width = -arg->width;
14529 }
14530 if (--ctx->fmtcnt >= 0) {
14531 arg->ch = FORMAT_READ(ctx);
14532 ctx->fmtpos++;
14533 }
14534 }
14535 else if (arg->ch >= '0' && arg->ch <= '9') {
14536 arg->width = arg->ch - '0';
14537 while (--ctx->fmtcnt >= 0) {
14538 arg->ch = FORMAT_READ(ctx);
14539 ctx->fmtpos++;
14540 if (arg->ch < '0' || arg->ch > '9')
14541 break;
14542 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14543 mixing signed and unsigned comparison. Since arg->ch is between
14544 '0' and '9', casting to int is safe. */
14545 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14546 PyErr_SetString(PyExc_ValueError,
14547 "width too big");
14548 return -1;
14549 }
14550 arg->width = arg->width*10 + (arg->ch - '0');
14551 }
14552 }
14553
14554 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014555 if (arg->ch == '.') {
14556 arg->prec = 0;
14557 if (--ctx->fmtcnt >= 0) {
14558 arg->ch = FORMAT_READ(ctx);
14559 ctx->fmtpos++;
14560 }
14561 if (arg->ch == '*') {
14562 v = unicode_format_getnextarg(ctx);
14563 if (v == NULL)
14564 return -1;
14565 if (!PyLong_Check(v)) {
14566 PyErr_SetString(PyExc_TypeError,
14567 "* wants int");
14568 return -1;
14569 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014570 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014571 if (arg->prec == -1 && PyErr_Occurred())
14572 return -1;
14573 if (arg->prec < 0)
14574 arg->prec = 0;
14575 if (--ctx->fmtcnt >= 0) {
14576 arg->ch = FORMAT_READ(ctx);
14577 ctx->fmtpos++;
14578 }
14579 }
14580 else if (arg->ch >= '0' && arg->ch <= '9') {
14581 arg->prec = arg->ch - '0';
14582 while (--ctx->fmtcnt >= 0) {
14583 arg->ch = FORMAT_READ(ctx);
14584 ctx->fmtpos++;
14585 if (arg->ch < '0' || arg->ch > '9')
14586 break;
14587 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14588 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014589 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014590 return -1;
14591 }
14592 arg->prec = arg->prec*10 + (arg->ch - '0');
14593 }
14594 }
14595 }
14596
14597 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14598 if (ctx->fmtcnt >= 0) {
14599 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14600 if (--ctx->fmtcnt >= 0) {
14601 arg->ch = FORMAT_READ(ctx);
14602 ctx->fmtpos++;
14603 }
14604 }
14605 }
14606 if (ctx->fmtcnt < 0) {
14607 PyErr_SetString(PyExc_ValueError,
14608 "incomplete format");
14609 return -1;
14610 }
14611 return 0;
14612
14613#undef FORMAT_READ
14614}
14615
14616/* Format one argument. Supported conversion specifiers:
14617
14618 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014619 - "i", "d", "u": int or float
14620 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014621 - "e", "E", "f", "F", "g", "G": float
14622 - "c": int or str (1 character)
14623
Victor Stinner8dbd4212012-12-04 09:30:24 +010014624 When possible, the output is written directly into the Unicode writer
14625 (ctx->writer). A string is created when padding is required.
14626
Victor Stinnera47082312012-10-04 02:19:54 +020014627 Return 0 if the argument has been formatted into *p_str,
14628 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014629 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014630static int
14631unicode_format_arg_format(struct unicode_formatter_t *ctx,
14632 struct unicode_format_arg_t *arg,
14633 PyObject **p_str)
14634{
14635 PyObject *v;
14636 _PyUnicodeWriter *writer = &ctx->writer;
14637
14638 if (ctx->fmtcnt == 0)
14639 ctx->writer.overallocate = 0;
14640
Victor Stinnera47082312012-10-04 02:19:54 +020014641 v = unicode_format_getnextarg(ctx);
14642 if (v == NULL)
14643 return -1;
14644
Victor Stinnera47082312012-10-04 02:19:54 +020014645
14646 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014647 case 's':
14648 case 'r':
14649 case 'a':
14650 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14651 /* Fast path */
14652 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14653 return -1;
14654 return 1;
14655 }
14656
14657 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14658 *p_str = v;
14659 Py_INCREF(*p_str);
14660 }
14661 else {
14662 if (arg->ch == 's')
14663 *p_str = PyObject_Str(v);
14664 else if (arg->ch == 'r')
14665 *p_str = PyObject_Repr(v);
14666 else
14667 *p_str = PyObject_ASCII(v);
14668 }
14669 break;
14670
14671 case 'i':
14672 case 'd':
14673 case 'u':
14674 case 'o':
14675 case 'x':
14676 case 'X':
14677 {
14678 int ret = mainformatlong(v, arg, p_str, writer);
14679 if (ret != 0)
14680 return ret;
14681 arg->sign = 1;
14682 break;
14683 }
14684
14685 case 'e':
14686 case 'E':
14687 case 'f':
14688 case 'F':
14689 case 'g':
14690 case 'G':
14691 if (arg->width == -1 && arg->prec == -1
14692 && !(arg->flags & (F_SIGN | F_BLANK)))
14693 {
14694 /* Fast path */
14695 if (formatfloat(v, arg, NULL, writer) == -1)
14696 return -1;
14697 return 1;
14698 }
14699
14700 arg->sign = 1;
14701 if (formatfloat(v, arg, p_str, NULL) == -1)
14702 return -1;
14703 break;
14704
14705 case 'c':
14706 {
14707 Py_UCS4 ch = formatchar(v);
14708 if (ch == (Py_UCS4) -1)
14709 return -1;
14710 if (arg->width == -1 && arg->prec == -1) {
14711 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014712 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014713 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014714 return 1;
14715 }
14716 *p_str = PyUnicode_FromOrdinal(ch);
14717 break;
14718 }
14719
14720 default:
14721 PyErr_Format(PyExc_ValueError,
14722 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014723 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014724 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14725 (int)arg->ch,
14726 ctx->fmtpos - 1);
14727 return -1;
14728 }
14729 if (*p_str == NULL)
14730 return -1;
14731 assert (PyUnicode_Check(*p_str));
14732 return 0;
14733}
14734
14735static int
14736unicode_format_arg_output(struct unicode_formatter_t *ctx,
14737 struct unicode_format_arg_t *arg,
14738 PyObject *str)
14739{
14740 Py_ssize_t len;
14741 enum PyUnicode_Kind kind;
14742 void *pbuf;
14743 Py_ssize_t pindex;
14744 Py_UCS4 signchar;
14745 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014746 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014747 Py_ssize_t sublen;
14748 _PyUnicodeWriter *writer = &ctx->writer;
14749 Py_UCS4 fill;
14750
14751 fill = ' ';
14752 if (arg->sign && arg->flags & F_ZERO)
14753 fill = '0';
14754
14755 if (PyUnicode_READY(str) == -1)
14756 return -1;
14757
14758 len = PyUnicode_GET_LENGTH(str);
14759 if ((arg->width == -1 || arg->width <= len)
14760 && (arg->prec == -1 || arg->prec >= len)
14761 && !(arg->flags & (F_SIGN | F_BLANK)))
14762 {
14763 /* Fast path */
14764 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14765 return -1;
14766 return 0;
14767 }
14768
14769 /* Truncate the string for "s", "r" and "a" formats
14770 if the precision is set */
14771 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14772 if (arg->prec >= 0 && len > arg->prec)
14773 len = arg->prec;
14774 }
14775
14776 /* Adjust sign and width */
14777 kind = PyUnicode_KIND(str);
14778 pbuf = PyUnicode_DATA(str);
14779 pindex = 0;
14780 signchar = '\0';
14781 if (arg->sign) {
14782 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14783 if (ch == '-' || ch == '+') {
14784 signchar = ch;
14785 len--;
14786 pindex++;
14787 }
14788 else if (arg->flags & F_SIGN)
14789 signchar = '+';
14790 else if (arg->flags & F_BLANK)
14791 signchar = ' ';
14792 else
14793 arg->sign = 0;
14794 }
14795 if (arg->width < len)
14796 arg->width = len;
14797
14798 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014799 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014800 if (!(arg->flags & F_LJUST)) {
14801 if (arg->sign) {
14802 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014803 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014804 }
14805 else {
14806 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014807 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014808 }
14809 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014810 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14811 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014812 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014813 }
14814
Victor Stinnera47082312012-10-04 02:19:54 +020014815 buflen = arg->width;
14816 if (arg->sign && len == arg->width)
14817 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014818 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014819 return -1;
14820
14821 /* Write the sign if needed */
14822 if (arg->sign) {
14823 if (fill != ' ') {
14824 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14825 writer->pos += 1;
14826 }
14827 if (arg->width > len)
14828 arg->width--;
14829 }
14830
14831 /* Write the numeric prefix for "x", "X" and "o" formats
14832 if the alternate form is used.
14833 For example, write "0x" for the "%#x" format. */
14834 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14835 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14836 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14837 if (fill != ' ') {
14838 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14839 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14840 writer->pos += 2;
14841 pindex += 2;
14842 }
14843 arg->width -= 2;
14844 if (arg->width < 0)
14845 arg->width = 0;
14846 len -= 2;
14847 }
14848
14849 /* Pad left with the fill character if needed */
14850 if (arg->width > len && !(arg->flags & F_LJUST)) {
14851 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014852 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014853 writer->pos += sublen;
14854 arg->width = len;
14855 }
14856
14857 /* If padding with spaces: write sign if needed and/or numeric prefix if
14858 the alternate form is used */
14859 if (fill == ' ') {
14860 if (arg->sign) {
14861 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14862 writer->pos += 1;
14863 }
14864 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14865 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14866 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14867 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14868 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14869 writer->pos += 2;
14870 pindex += 2;
14871 }
14872 }
14873
14874 /* Write characters */
14875 if (len) {
14876 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14877 str, pindex, len);
14878 writer->pos += len;
14879 }
14880
14881 /* Pad right with the fill character if needed */
14882 if (arg->width > len) {
14883 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014884 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014885 writer->pos += sublen;
14886 }
14887 return 0;
14888}
14889
14890/* Helper of PyUnicode_Format(): format one arg.
14891 Return 0 on success, raise an exception and return -1 on error. */
14892static int
14893unicode_format_arg(struct unicode_formatter_t *ctx)
14894{
14895 struct unicode_format_arg_t arg;
14896 PyObject *str;
14897 int ret;
14898
Victor Stinner8dbd4212012-12-04 09:30:24 +010014899 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014900 if (arg.ch == '%') {
14901 ctx->fmtpos++;
14902 ctx->fmtcnt--;
14903 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14904 return -1;
14905 return 0;
14906 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014907 arg.flags = 0;
14908 arg.width = -1;
14909 arg.prec = -1;
14910 arg.sign = 0;
14911 str = NULL;
14912
Victor Stinnera47082312012-10-04 02:19:54 +020014913 ret = unicode_format_arg_parse(ctx, &arg);
14914 if (ret == -1)
14915 return -1;
14916
14917 ret = unicode_format_arg_format(ctx, &arg, &str);
14918 if (ret == -1)
14919 return -1;
14920
14921 if (ret != 1) {
14922 ret = unicode_format_arg_output(ctx, &arg, str);
14923 Py_DECREF(str);
14924 if (ret == -1)
14925 return -1;
14926 }
14927
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014928 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014929 PyErr_SetString(PyExc_TypeError,
14930 "not all arguments converted during string formatting");
14931 return -1;
14932 }
14933 return 0;
14934}
14935
Alexander Belopolsky40018472011-02-26 01:02:56 +000014936PyObject *
14937PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014938{
Victor Stinnera47082312012-10-04 02:19:54 +020014939 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014940
Guido van Rossumd57fd912000-03-10 22:53:23 +000014941 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014942 PyErr_BadInternalCall();
14943 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014944 }
Victor Stinnera47082312012-10-04 02:19:54 +020014945
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014946 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014947 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014948
14949 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014950 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14951 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14952 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14953 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014954
Victor Stinner8f674cc2013-04-17 23:02:17 +020014955 _PyUnicodeWriter_Init(&ctx.writer);
14956 ctx.writer.min_length = ctx.fmtcnt + 100;
14957 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014958
Guido van Rossumd57fd912000-03-10 22:53:23 +000014959 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014960 ctx.arglen = PyTuple_Size(args);
14961 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014962 }
14963 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014964 ctx.arglen = -1;
14965 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014966 }
Victor Stinnera47082312012-10-04 02:19:54 +020014967 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014968 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014969 ctx.dict = args;
14970 else
14971 ctx.dict = NULL;
14972 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014973
Victor Stinnera47082312012-10-04 02:19:54 +020014974 while (--ctx.fmtcnt >= 0) {
14975 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014976 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014977
14978 nonfmtpos = ctx.fmtpos++;
14979 while (ctx.fmtcnt >= 0 &&
14980 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14981 ctx.fmtpos++;
14982 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014983 }
Victor Stinnera47082312012-10-04 02:19:54 +020014984 if (ctx.fmtcnt < 0) {
14985 ctx.fmtpos--;
14986 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014987 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014988
Victor Stinnercfc4c132013-04-03 01:48:39 +020014989 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14990 nonfmtpos, ctx.fmtpos) < 0)
14991 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014992 }
14993 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014994 ctx.fmtpos++;
14995 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014996 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014997 }
14998 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020014999
Victor Stinnera47082312012-10-04 02:19:54 +020015000 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015001 PyErr_SetString(PyExc_TypeError,
15002 "not all arguments converted during string formatting");
15003 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015004 }
15005
Victor Stinnera47082312012-10-04 02:19:54 +020015006 if (ctx.args_owned) {
15007 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015008 }
Victor Stinnera47082312012-10-04 02:19:54 +020015009 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015010
Benjamin Peterson29060642009-01-31 22:14:21 +000015011 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015012 _PyUnicodeWriter_Dealloc(&ctx.writer);
15013 if (ctx.args_owned) {
15014 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015015 }
15016 return NULL;
15017}
15018
Jeremy Hylton938ace62002-07-17 16:30:39 +000015019static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015020unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15021
Tim Peters6d6c1a32001-08-02 04:15:00 +000015022static PyObject *
15023unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15024{
Benjamin Peterson29060642009-01-31 22:14:21 +000015025 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015026 static char *kwlist[] = {"object", "encoding", "errors", 0};
15027 char *encoding = NULL;
15028 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015029
Benjamin Peterson14339b62009-01-31 16:36:08 +000015030 if (type != &PyUnicode_Type)
15031 return unicode_subtype_new(type, args, kwds);
15032 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015033 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015034 return NULL;
15035 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015036 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015037 if (encoding == NULL && errors == NULL)
15038 return PyObject_Str(x);
15039 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015040 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015041}
15042
Guido van Rossume023fe02001-08-30 03:12:59 +000015043static PyObject *
15044unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15045{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015046 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015047 Py_ssize_t length, char_size;
15048 int share_wstr, share_utf8;
15049 unsigned int kind;
15050 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015051
Benjamin Peterson14339b62009-01-31 16:36:08 +000015052 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015053
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015054 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015056 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015057 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015058 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015059 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015060 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015061 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015063 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015064 if (self == NULL) {
15065 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015066 return NULL;
15067 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015068 kind = PyUnicode_KIND(unicode);
15069 length = PyUnicode_GET_LENGTH(unicode);
15070
15071 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015072#ifdef Py_DEBUG
15073 _PyUnicode_HASH(self) = -1;
15074#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015075 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015076#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077 _PyUnicode_STATE(self).interned = 0;
15078 _PyUnicode_STATE(self).kind = kind;
15079 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015080 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015081 _PyUnicode_STATE(self).ready = 1;
15082 _PyUnicode_WSTR(self) = NULL;
15083 _PyUnicode_UTF8_LENGTH(self) = 0;
15084 _PyUnicode_UTF8(self) = NULL;
15085 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015086 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015087
15088 share_utf8 = 0;
15089 share_wstr = 0;
15090 if (kind == PyUnicode_1BYTE_KIND) {
15091 char_size = 1;
15092 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15093 share_utf8 = 1;
15094 }
15095 else if (kind == PyUnicode_2BYTE_KIND) {
15096 char_size = 2;
15097 if (sizeof(wchar_t) == 2)
15098 share_wstr = 1;
15099 }
15100 else {
15101 assert(kind == PyUnicode_4BYTE_KIND);
15102 char_size = 4;
15103 if (sizeof(wchar_t) == 4)
15104 share_wstr = 1;
15105 }
15106
15107 /* Ensure we won't overflow the length. */
15108 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15109 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015110 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015111 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015112 data = PyObject_MALLOC((length + 1) * char_size);
15113 if (data == NULL) {
15114 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015115 goto onError;
15116 }
15117
Victor Stinnerc3c74152011-10-02 20:39:55 +020015118 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015119 if (share_utf8) {
15120 _PyUnicode_UTF8_LENGTH(self) = length;
15121 _PyUnicode_UTF8(self) = data;
15122 }
15123 if (share_wstr) {
15124 _PyUnicode_WSTR_LENGTH(self) = length;
15125 _PyUnicode_WSTR(self) = (wchar_t *)data;
15126 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015127
Christian Heimesf051e432016-09-13 20:22:02 +020015128 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015129 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015130 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015131#ifdef Py_DEBUG
15132 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15133#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015134 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015135 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015136
15137onError:
15138 Py_DECREF(unicode);
15139 Py_DECREF(self);
15140 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015141}
15142
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015143PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015144"str(object='') -> str\n\
15145str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015146\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015147Create a new string object from the given object. If encoding or\n\
15148errors is specified, then the object must expose a data buffer\n\
15149that will be decoded using the given encoding and error handler.\n\
15150Otherwise, returns the result of object.__str__() (if defined)\n\
15151or repr(object).\n\
15152encoding defaults to sys.getdefaultencoding().\n\
15153errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015154
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015155static PyObject *unicode_iter(PyObject *seq);
15156
Guido van Rossumd57fd912000-03-10 22:53:23 +000015157PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015158 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015159 "str", /* tp_name */
15160 sizeof(PyUnicodeObject), /* tp_basicsize */
15161 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015162 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015163 (destructor)unicode_dealloc, /* tp_dealloc */
15164 0, /* tp_print */
15165 0, /* tp_getattr */
15166 0, /* tp_setattr */
15167 0, /* tp_reserved */
15168 unicode_repr, /* tp_repr */
15169 &unicode_as_number, /* tp_as_number */
15170 &unicode_as_sequence, /* tp_as_sequence */
15171 &unicode_as_mapping, /* tp_as_mapping */
15172 (hashfunc) unicode_hash, /* tp_hash*/
15173 0, /* tp_call*/
15174 (reprfunc) unicode_str, /* tp_str */
15175 PyObject_GenericGetAttr, /* tp_getattro */
15176 0, /* tp_setattro */
15177 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015178 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015179 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15180 unicode_doc, /* tp_doc */
15181 0, /* tp_traverse */
15182 0, /* tp_clear */
15183 PyUnicode_RichCompare, /* tp_richcompare */
15184 0, /* tp_weaklistoffset */
15185 unicode_iter, /* tp_iter */
15186 0, /* tp_iternext */
15187 unicode_methods, /* tp_methods */
15188 0, /* tp_members */
15189 0, /* tp_getset */
15190 &PyBaseObject_Type, /* tp_base */
15191 0, /* tp_dict */
15192 0, /* tp_descr_get */
15193 0, /* tp_descr_set */
15194 0, /* tp_dictoffset */
15195 0, /* tp_init */
15196 0, /* tp_alloc */
15197 unicode_new, /* tp_new */
15198 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015199};
15200
15201/* Initialize the Unicode implementation */
15202
Victor Stinner331a6a52019-05-27 16:39:22 +020015203PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015204_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015205{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015206 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015207 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015208 0x000A, /* LINE FEED */
15209 0x000D, /* CARRIAGE RETURN */
15210 0x001C, /* FILE SEPARATOR */
15211 0x001D, /* GROUP SEPARATOR */
15212 0x001E, /* RECORD SEPARATOR */
15213 0x0085, /* NEXT LINE */
15214 0x2028, /* LINE SEPARATOR */
15215 0x2029, /* PARAGRAPH SEPARATOR */
15216 };
15217
Fred Drakee4315f52000-05-09 19:53:39 +000015218 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015219 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015220 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015221 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015222 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015223 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015224
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015225 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015226 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015227 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015228
15229 /* initialize the linebreak bloom filter */
15230 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015231 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015232 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015233
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015234 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015235 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015236 }
15237 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015238 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015239 }
15240 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015241 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015242 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015243 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015244}
15245
15246/* Finalize the Unicode implementation */
15247
/* Nothing is released here; the function unconditionally reports 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15253
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015254
Walter Dörwald16807132007-05-25 13:52:07 +000015255void
15256PyUnicode_InternInPlace(PyObject **p)
15257{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015258 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015259 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015260#ifdef Py_DEBUG
15261 assert(s != NULL);
15262 assert(_PyUnicode_CHECK(s));
15263#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015264 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015265 return;
15266#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015267 /* If it's a subclass, we don't really know what putting
15268 it in the interned dict might do. */
15269 if (!PyUnicode_CheckExact(s))
15270 return;
15271 if (PyUnicode_CHECK_INTERNED(s))
15272 return;
15273 if (interned == NULL) {
15274 interned = PyDict_New();
15275 if (interned == NULL) {
15276 PyErr_Clear(); /* Don't leave an exception */
15277 return;
15278 }
15279 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015280 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015281 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015283 if (t == NULL) {
15284 PyErr_Clear();
15285 return;
15286 }
15287 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015288 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015289 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015290 return;
15291 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015292 /* The two references in interned are not counted by refcnt.
15293 The deallocator will take care of this */
15294 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015295 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015296}
15297
15298void
15299PyUnicode_InternImmortal(PyObject **p)
15300{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015301 PyUnicode_InternInPlace(p);
15302 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015303 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015304 Py_INCREF(*p);
15305 }
Walter Dörwald16807132007-05-25 13:52:07 +000015306}
15307
15308PyObject *
15309PyUnicode_InternFromString(const char *cp)
15310{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015311 PyObject *s = PyUnicode_FromString(cp);
15312 if (s == NULL)
15313 return NULL;
15314 PyUnicode_InternInPlace(&s);
15315 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015316}
15317
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015318
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo interning for every string in the interned dict, then clear and
   release the dict itself.  Only compiled for memory-debugging builds. */
static void
unicode_release_interned(void)
{
    PyObject *keys;
    Py_ssize_t count;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (Py_ssize_t pos = 0; pos < count; pos++) {
        PyObject *s = PyList_GET_ITEM(keys, pos);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }

        int state = PyUnicode_CHECK_INTERNED(s);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
        }
        else if (state != SSTATE_NOT_INTERNED) {
            Py_FatalError("Inconsistent interned string state.");
        }
        /* else: SSTATE_NOT_INTERNED -- XXX Shouldn't happen */

        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015378
15379
15380/********************* Unicode Iterator **************************/
15381
15382typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015383 PyObject_HEAD
15384 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015385 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015386} unicodeiterobject;
15387
15388static void
15389unicodeiter_dealloc(unicodeiterobject *it)
15390{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015391 _PyObject_GC_UNTRACK(it);
15392 Py_XDECREF(it->it_seq);
15393 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015394}
15395
15396static int
15397unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15398{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015399 Py_VISIT(it->it_seq);
15400 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015401}
15402
15403static PyObject *
15404unicodeiter_next(unicodeiterobject *it)
15405{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015406 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015407
Benjamin Peterson14339b62009-01-31 16:36:08 +000015408 assert(it != NULL);
15409 seq = it->it_seq;
15410 if (seq == NULL)
15411 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015412 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015413
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015414 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15415 int kind = PyUnicode_KIND(seq);
15416 void *data = PyUnicode_DATA(seq);
15417 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15418 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015419 if (item != NULL)
15420 ++it->it_index;
15421 return item;
15422 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015423
Benjamin Peterson14339b62009-01-31 16:36:08 +000015424 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015425 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015427}
15428
15429static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015430unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015431{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015432 Py_ssize_t len = 0;
15433 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015434 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015435 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015436}
15437
15438PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15439
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015440static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015441unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015442{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015443 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015444 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015445 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015446 it->it_seq, it->it_index);
15447 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015448 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015449 if (u == NULL)
15450 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015451 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015452 }
15453}
15454
15455PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15456
15457static PyObject *
15458unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15459{
15460 Py_ssize_t index = PyLong_AsSsize_t(state);
15461 if (index == -1 && PyErr_Occurred())
15462 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015463 if (it->it_seq != NULL) {
15464 if (index < 0)
15465 index = 0;
15466 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15467 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15468 it->it_index = index;
15469 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015470 Py_RETURN_NONE;
15471}
15472
15473PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15474
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015475static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015476 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015477 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015478 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15479 reduce_doc},
15480 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15481 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015482 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015483};
15484
15485PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015486 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15487 "str_iterator", /* tp_name */
15488 sizeof(unicodeiterobject), /* tp_basicsize */
15489 0, /* tp_itemsize */
15490 /* methods */
15491 (destructor)unicodeiter_dealloc, /* tp_dealloc */
15492 0, /* tp_print */
15493 0, /* tp_getattr */
15494 0, /* tp_setattr */
Mark Dickinsone94c6792009-02-02 20:36:42 +000015495 0, /* tp_reserved */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015496 0, /* tp_repr */
15497 0, /* tp_as_number */
15498 0, /* tp_as_sequence */
15499 0, /* tp_as_mapping */
15500 0, /* tp_hash */
15501 0, /* tp_call */
15502 0, /* tp_str */
15503 PyObject_GenericGetAttr, /* tp_getattro */
15504 0, /* tp_setattro */
15505 0, /* tp_as_buffer */
15506 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15507 0, /* tp_doc */
15508 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15509 0, /* tp_clear */
15510 0, /* tp_richcompare */
15511 0, /* tp_weaklistoffset */
15512 PyObject_SelfIter, /* tp_iter */
15513 (iternextfunc)unicodeiter_next, /* tp_iternext */
15514 unicodeiter_methods, /* tp_methods */
15515 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015516};
15517
15518static PyObject *
15519unicode_iter(PyObject *seq)
15520{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015521 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015522
Benjamin Peterson14339b62009-01-31 16:36:08 +000015523 if (!PyUnicode_Check(seq)) {
15524 PyErr_BadInternalCall();
15525 return NULL;
15526 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015527 if (PyUnicode_READY(seq) == -1)
15528 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015529 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15530 if (it == NULL)
15531 return NULL;
15532 it->it_index = 0;
15533 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015534 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015535 _PyObject_GC_TRACK(it);
15536 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015537}
15538
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015539
15540size_t
15541Py_UNICODE_strlen(const Py_UNICODE *u)
15542{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015543 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015544}
15545
15546Py_UNICODE*
15547Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15548{
15549 Py_UNICODE *u = s1;
15550 while ((*u++ = *s2++));
15551 return s1;
15552}
15553
15554Py_UNICODE*
15555Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15556{
15557 Py_UNICODE *u = s1;
15558 while ((*u++ = *s2++))
15559 if (n-- == 0)
15560 break;
15561 return s1;
15562}
15563
15564Py_UNICODE*
15565Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15566{
15567 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015568 u1 += wcslen(u1);
15569 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015570 return s1;
15571}
15572
15573int
15574Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15575{
15576 while (*s1 && *s2 && *s1 == *s2)
15577 s1++, s2++;
15578 if (*s1 && *s2)
15579 return (*s1 < *s2) ? -1 : +1;
15580 if (*s1)
15581 return 1;
15582 if (*s2)
15583 return -1;
15584 return 0;
15585}
15586
15587int
15588Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15589{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015590 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015591 for (; n != 0; n--) {
15592 u1 = *s1;
15593 u2 = *s2;
15594 if (u1 != u2)
15595 return (u1 < u2) ? -1 : +1;
15596 if (u1 == '\0')
15597 return 0;
15598 s1++;
15599 s2++;
15600 }
15601 return 0;
15602}
15603
15604Py_UNICODE*
15605Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15606{
15607 const Py_UNICODE *p;
15608 for (p = s; *p; p++)
15609 if (*p == c)
15610 return (Py_UNICODE*)p;
15611 return NULL;
15612}
15613
15614Py_UNICODE*
15615Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15616{
15617 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015618 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015619 while (p != s) {
15620 p--;
15621 if (*p == c)
15622 return (Py_UNICODE*)p;
15623 }
15624 return NULL;
15625}
Victor Stinner331ea922010-08-10 16:37:20 +000015626
Victor Stinner71133ff2010-09-01 23:43:53 +000015627Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015628PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015629{
Victor Stinner577db2c2011-10-11 22:12:48 +020015630 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015631 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015632
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015633 if (!PyUnicode_Check(unicode)) {
15634 PyErr_BadArgument();
15635 return NULL;
15636 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015637 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015638 if (u == NULL)
15639 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015640 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015641 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015642 PyErr_NoMemory();
15643 return NULL;
15644 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015645 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015646 size *= sizeof(Py_UNICODE);
15647 copy = PyMem_Malloc(size);
15648 if (copy == NULL) {
15649 PyErr_NoMemory();
15650 return NULL;
15651 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015652 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015653 return copy;
15654}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015655
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015656
Victor Stinner709d23d2019-05-02 14:56:30 -040015657static int
15658encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015659{
Victor Stinner709d23d2019-05-02 14:56:30 -040015660 int res;
15661 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15662 if (res == -2) {
15663 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15664 return -1;
15665 }
15666 if (res < 0) {
15667 PyErr_NoMemory();
15668 return -1;
15669 }
15670 return 0;
15671}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015672
Victor Stinner709d23d2019-05-02 14:56:30 -040015673
15674static int
15675config_get_codec_name(wchar_t **config_encoding)
15676{
15677 char *encoding;
15678 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15679 return -1;
15680 }
15681
15682 PyObject *name_obj = NULL;
15683 PyObject *codec = _PyCodec_Lookup(encoding);
15684 PyMem_RawFree(encoding);
15685
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015686 if (!codec)
15687 goto error;
15688
15689 name_obj = PyObject_GetAttrString(codec, "name");
15690 Py_CLEAR(codec);
15691 if (!name_obj) {
15692 goto error;
15693 }
15694
Victor Stinner709d23d2019-05-02 14:56:30 -040015695 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15696 Py_DECREF(name_obj);
15697 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015698 goto error;
15699 }
15700
Victor Stinner709d23d2019-05-02 14:56:30 -040015701 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15702 if (raw_wname == NULL) {
15703 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015704 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015705 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015706 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015707
15708 PyMem_RawFree(*config_encoding);
15709 *config_encoding = raw_wname;
15710
15711 PyMem_Free(wname);
15712 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015713
15714error:
15715 Py_XDECREF(codec);
15716 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015717 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015718}
15719
15720
Victor Stinner331a6a52019-05-27 16:39:22 +020015721static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015722init_stdio_encoding(PyInterpreterState *interp)
15723{
Victor Stinner709d23d2019-05-02 14:56:30 -040015724 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015725 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015726 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015727 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015728 "of the stdio encoding");
15729 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015730 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015731}
15732
15733
Victor Stinner709d23d2019-05-02 14:56:30 -040015734static int
15735init_fs_codec(PyInterpreterState *interp)
15736{
Victor Stinner331a6a52019-05-27 16:39:22 +020015737 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015738
15739 _Py_error_handler error_handler;
15740 error_handler = get_error_handler_wide(config->filesystem_errors);
15741 if (error_handler == _Py_ERROR_UNKNOWN) {
15742 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15743 return -1;
15744 }
15745
15746 char *encoding, *errors;
15747 if (encode_wstr_utf8(config->filesystem_encoding,
15748 &encoding,
15749 "filesystem_encoding") < 0) {
15750 return -1;
15751 }
15752
15753 if (encode_wstr_utf8(config->filesystem_errors,
15754 &errors,
15755 "filesystem_errors") < 0) {
15756 PyMem_RawFree(encoding);
15757 return -1;
15758 }
15759
15760 PyMem_RawFree(interp->fs_codec.encoding);
15761 interp->fs_codec.encoding = encoding;
15762 PyMem_RawFree(interp->fs_codec.errors);
15763 interp->fs_codec.errors = errors;
15764 interp->fs_codec.error_handler = error_handler;
15765
15766 /* At this point, PyUnicode_EncodeFSDefault() and
15767 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15768 the C implementation of the filesystem encoding. */
15769
15770 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15771 global configuration variables. */
15772 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15773 interp->fs_codec.errors) < 0) {
15774 PyErr_NoMemory();
15775 return -1;
15776 }
15777 return 0;
15778}
15779
15780
Victor Stinner331a6a52019-05-27 16:39:22 +020015781static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015782init_fs_encoding(PyInterpreterState *interp)
15783{
Victor Stinner709d23d2019-05-02 14:56:30 -040015784 /* Update the filesystem encoding to the normalized Python codec name.
15785 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15786 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015787 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015788 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015789 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015790 "of the filesystem encoding");
15791 }
15792
Victor Stinner709d23d2019-05-02 14:56:30 -040015793 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015794 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015795 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015796 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015797}
15798
15799
Victor Stinner331a6a52019-05-27 16:39:22 +020015800PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015801_PyUnicode_InitEncodings(PyInterpreterState *interp)
15802{
Victor Stinner331a6a52019-05-27 16:39:22 +020015803 PyStatus status = init_fs_encoding(interp);
15804 if (_PyStatus_EXCEPTION(status)) {
15805 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015806 }
15807
15808 return init_stdio_encoding(interp);
15809}
15810
15811
#ifdef MS_WINDOWS
/* Switch the current interpreter's filesystem encoding to mbcs/replace
   (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with a memory error set if
   the new configuration strings cannot be allocated (the configuration is
   left unchanged in that case). */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Set the filesystem encoding to mbcs/replace (PEP 529) */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (new_encoding == NULL || new_errors == NULL) {
        /* Release whichever duplicate did succeed. */
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_encoding = new_encoding;
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
15837
15838
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015839void
15840_PyUnicode_Fini(void)
15841{
15842#if defined(WITH_VALGRIND) || defined(__INSURE__)
15843 /* Insure++ is a memory analysis tool that aids in discovering
15844 * memory leaks and other memory problems. On Python exit, the
15845 * interned string dictionaries are flagged as being in use at exit
15846 * (which it is). Under normal circumstances, this is fine because
15847 * the memory will be automatically reclaimed by the system. Under
15848 * memory debugging, it's a huge source of useless noise, so we
15849 * trade off slower shutdown for less distraction in the memory
15850 * reports. -baw
15851 */
15852 unicode_release_interned();
15853#endif /* __INSURE__ */
15854
15855 Py_CLEAR(unicode_empty);
15856
15857 for (Py_ssize_t i = 0; i < 256; i++) {
15858 Py_CLEAR(unicode_latin1[i]);
15859 }
15860 _PyUnicode_ClearStaticStrings();
15861 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015862
15863 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15864 PyMem_RawFree(interp->fs_codec.encoding);
15865 interp->fs_codec.encoding = NULL;
15866 PyMem_RawFree(interp->fs_codec.errors);
15867 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015868}
15869
15870
/* A _string module, which exports formatter_parser and
   formatter_field_name_split to the string.Formatter class implemented
   in Python. */
15873
15874static PyMethodDef _string_methods[] = {
15875 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15876 METH_O, PyDoc_STR("split the argument as a field name")},
15877 {"formatter_parser", (PyCFunction) formatter_parser,
15878 METH_O, PyDoc_STR("parse the argument as a format string")},
15879 {NULL, NULL}
15880};
15881
15882static struct PyModuleDef _string_module = {
15883 PyModuleDef_HEAD_INIT,
15884 "_string",
15885 PyDoc_STR("string helper module"),
15886 0,
15887 _string_methods,
15888 NULL,
15889 NULL,
15890 NULL,
15891 NULL
15892};
15893
15894PyMODINIT_FUNC
15895PyInit__string(void)
15896{
15897 return PyModule_Create(&_string_module);
15898}
15899
15900
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015901#ifdef __cplusplus
15902}
15903#endif