blob: c28f36fb4618b5e71df25b9d398be0f60f1e90a7 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinner331a6a52019-05-27 16:39:22 +020043#include "pycore_initconfig.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010044#include "pycore_fileutils.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010045#include "pycore_object.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040046#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010047#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000048#include "ucnhash.h"
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050049#include "bytes_methods.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070050#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000051
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000052#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000053#include <windows.h>
54#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000055
Victor Stinnerfecc4f22019-03-19 14:20:29 +010056/* Uncomment to display statistics on interned strings at exit when
57 using Valgrind or Insecure++. */
58/* #define INTERNED_STATS 1 */
59
60
Larry Hastings61272b72014-01-07 12:41:53 -080061/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090062class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080063[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090064/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
65
66/*[python input]
67class Py_UCS4_converter(CConverter):
68 type = 'Py_UCS4'
69 converter = 'convert_uc'
70
71 def converter_init(self):
72 if self.default is not unspecified:
73 self.c_default = ascii(self.default)
74 if len(self.c_default) > 4 or self.c_default[0] != "'":
75 self.c_default = hex(ord(self.default))
76
77[python start generated code]*/
78/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080079
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000080/* --- Globals ------------------------------------------------------------
81
Serhiy Storchaka05997252013-01-26 12:14:02 +020082NOTE: In the interpreter's initialization phase, some globals are currently
83 initialized dynamically as needed. In the process Unicode objects may
84 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000085
86*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000087
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000088
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* Largest valid Unicode code point: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10FFFF
95
/* _PyUnicode_CHECK(op): argument check used by the assertions in this file.
   Without Py_DEBUG it is a plain type check; with Py_DEBUG it calls
   _PyUnicode_CheckConsistency() with check_content set to 0. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200101
/* Raw access to the utf8 field of the compact header (no checks). */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject *)(op))->utf8)

/* UTF-8 buffer of a ready string.  For a compact ASCII string this is the
   memory located right after the PyASCIIObject header; for every other
   string it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((char *)((PyASCIIObject *)(op) + 1))       \
          : _PyUnicode_UTF8(op)))

/* Raw access to the utf8_length field of the compact header (no checks). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)

/* Length of the UTF-8 buffer of a ready string.  For a compact ASCII string
   this is the string length itself; otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     (PyUnicode_IS_COMPACT_ASCII(op)                    \
          ? ((PyASCIIObject *)(op))->length             \
          : _PyUnicode_UTF8_LENGTH(op)))
/* Direct field accessors.  Each one expands to a struct member expression
   on the object cast to the structure that declares the field. */
#define _PyUnicode_WSTR(op)         (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)  (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)       (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)        (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)         (((PyASCIIObject *)(op))->hash)

/* Same as reading state.kind / length, but asserting _PyUnicode_CHECK(op)
   first. */
#define _PyUnicode_KIND(op)                     \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)               \
    (assert(_PyUnicode_CHECK(op)),              \
     ((PyASCIIObject *)(op))->length)

/* Untyped data pointer of a non-compact (PyUnicodeObject) string. */
#define _PyUnicode_DATA_ANY(op)     (((PyUnicodeObject *)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200136
/* Local replacement for PyUnicode_READY(): asserts _PyUnicode_CHECK(op),
   then evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                                     \
    (assert(_PyUnicode_CHECK(op)),                              \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200143
/* True if the utf8 pointer of a string that is not compact ASCII is the
   same pointer as its character data. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (PyUnicode_DATA(op) == _PyUnicode_UTF8(op)))
/* True if the wstr pointer of the string is the same pointer as its
   character data.  The macro only refers to its own argument `op`; it must
   not depend on a variable named `unicode` existing at the call site. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
151
/* True if the Unicode object owns a separately allocated UTF-8 block:
   it is not compact ASCII, its utf8 pointer is set, and that pointer is
   not shared with the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (!PyUnicode_IS_COMPACT_ASCII(op)                    \
     && _PyUnicode_UTF8(op)                             \
     && PyUnicode_DATA(op) != _PyUnicode_UTF8(op))
158
/* True if the Unicode object owns a separately allocated wstr block:
   wstr is set and it is not the case that the string is ready with wstr
   pointing at the character data.  PyUnicode_DATA() is only evaluated for
   ready strings. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                          \
    (_PyUnicode_WSTR(op)                                        \
     && !(PyUnicode_IS_READY(op)                                \
          && _PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
165
/* Generic character-width conversion helper.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and are treated as "from_type *"; to is treated as
   "to_type *" and receives one converted character per source character.
   Characters are copied in groups of four first, then one at a time for
   the remaining tail. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (from_type *)(begin);                  \
        const from_type *_end = (from_type *)(end);                     \
        Py_ssize_t _total = _end - _iter;                               \
        const from_type *_blocks_end =                                  \
            _iter + _Py_SIZE_ROUND_DOWN(_total, 4);                     \
        for (; _iter < _blocks_end; _iter += 4, _to += 4) {             \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < _end; ++_iter, ++_to) {                          \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200189
/* Platform-specific over-allocation factor. */
#ifndef MS_WINDOWS
   /* On Linux, overallocate by 25% is the best factor */
#  define OVERALLOCATE_FACTOR 4
#else
   /* On Windows, overallocate by 50% is the best factor */
#  define OVERALLOCATE_FACTOR 2
#endif
197
Walter Dörwald16807132007-05-25 13:52:07 +0000198/* This dictionary holds all interned unicode strings. Note that references
199 to strings in this dictionary are *not* counted in the string's ob_refcnt.
200 When the interned string reaches a refcnt of 0 the string deallocation
201 function will delete the reference from this dictionary.
202
203 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000204 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000205*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200206static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000207
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000208/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200209static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200210
/* Take a new reference to the shared empty string, creating it on first
   use.  If the creation fails, unicode_empty is left NULL and no reference
   is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000223
/* Return the shared empty string from the enclosing function after taking
   a reference with _Py_INCREF_UNICODE_EMPTY().  The returned value is
   whatever unicode_empty holds afterwards. */
#define _Py_RETURN_UNICODE_EMPTY()              \
    do {                                        \
        _Py_INCREF_UNICODE_EMPTY();             \
        return unicode_empty;                   \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000229
Victor Stinner59423e32018-11-26 13:40:01 +0100230static inline void
231unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
232 Py_ssize_t start, Py_ssize_t length)
233{
234 assert(0 <= start);
235 assert(kind != PyUnicode_WCHAR_KIND);
236 switch (kind) {
237 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100238 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100239 Py_UCS1 ch = (unsigned char)value;
240 Py_UCS1 *to = (Py_UCS1 *)data + start;
241 memset(to, ch, length);
242 break;
243 }
244 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100245 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100246 Py_UCS2 ch = (Py_UCS2)value;
247 Py_UCS2 *to = (Py_UCS2 *)data + start;
248 const Py_UCS2 *end = to + length;
249 for (; to < end; ++to) *to = ch;
250 break;
251 }
252 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100253 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100254 Py_UCS4 ch = value;
255 Py_UCS4 * to = (Py_UCS4 *)data + start;
256 const Py_UCS4 *end = to + length;
257 for (; to < end; ++to) *to = ch;
258 break;
259 }
260 default: Py_UNREACHABLE();
261 }
262}
263
264
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200265/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700266static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Victor Stinner709d23d2019-05-02 14:56:30 -0400268static PyObject *
269unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
270 const char *errors);
271static PyObject *
272unicode_decode_utf8(const char *s, Py_ssize_t size,
273 _Py_error_handler error_handler, const char *errors,
274 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200275
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200276/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200277static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200278
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000279/* Single character Unicode strings in the Latin-1 range are being
280 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000282
/* Fast detection of the most frequent whitespace characters: the table is
   indexed by a 7-bit character code and holds 1 for whitespace, 0 for
   everything else (entries that are not listed are zero-initialized). */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1,     /* SPACE */
};
313
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200314/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200315static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200316static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100317static int unicode_modifiable(PyObject *unicode);
318
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319
Alexander Belopolsky40018472011-02-26 01:02:56 +0000320static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100321_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200322static PyObject *
323_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
324static PyObject *
325_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
326
327static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000328unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000329 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100330 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000331 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
332
Alexander Belopolsky40018472011-02-26 01:02:56 +0000333static void
334raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300335 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100336 PyObject *unicode,
337 Py_ssize_t startpos, Py_ssize_t endpos,
338 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000339
/* Same kind of lookup table for line breaks: indexed by a 7-bit character
   code, 1 for a line-break character and 0 otherwise (entries that are not
   listed are zero-initialized). */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
};
367
INADA Naoki3ae20562017-01-16 20:41:20 +0900368static int convert_uc(PyObject *obj, void *addr);
369
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300370#include "clinic/unicodeobject.c.h"
371
Victor Stinner3d4226a2018-08-29 22:21:32 +0200372_Py_error_handler
373_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200374{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200375 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200376 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200377 }
378 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200379 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200380 }
381 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200382 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200383 }
384 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200385 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200386 }
387 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200388 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200389 }
390 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200391 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200392 }
393 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200394 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200395 }
Victor Stinner50149202015-09-22 00:26:54 +0200396 return _Py_ERROR_OTHER;
397}
398
Victor Stinner709d23d2019-05-02 14:56:30 -0400399
400static _Py_error_handler
401get_error_handler_wide(const wchar_t *errors)
402{
403 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
404 return _Py_ERROR_STRICT;
405 }
406 if (wcscmp(errors, L"surrogateescape") == 0) {
407 return _Py_ERROR_SURROGATEESCAPE;
408 }
409 if (wcscmp(errors, L"replace") == 0) {
410 return _Py_ERROR_REPLACE;
411 }
412 if (wcscmp(errors, L"ignore") == 0) {
413 return _Py_ERROR_IGNORE;
414 }
415 if (wcscmp(errors, L"backslashreplace") == 0) {
416 return _Py_ERROR_BACKSLASHREPLACE;
417 }
418 if (wcscmp(errors, L"surrogatepass") == 0) {
419 return _Py_ERROR_SURROGATEPASS;
420 }
421 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
422 return _Py_ERROR_XMLCHARREFREPLACE;
423 }
424 return _Py_ERROR_OTHER;
425}
426
427
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300428/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
429 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000430Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000431PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000432{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000433#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000434 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000435#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000436 /* This is actually an illegal character, so it should
437 not be passed to unichr. */
438 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000439#endif
440}
441
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200442int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100443_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200444{
445 PyASCIIObject *ascii;
446 unsigned int kind;
447
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200448 _PyObject_ASSERT(op, PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200449
450 ascii = (PyASCIIObject *)op;
451 kind = ascii->state.kind;
452
Victor Stinnera3b334d2011-10-03 13:53:37 +0200453 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200454 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND);
455 _PyObject_ASSERT(op, ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200456 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200457 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200458 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200459 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200460
Victor Stinnera41463c2011-10-04 01:05:08 +0200461 if (ascii->state.compact == 1) {
462 data = compact + 1;
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200463 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
464 || kind == PyUnicode_2BYTE_KIND
465 || kind == PyUnicode_4BYTE_KIND);
466 _PyObject_ASSERT(op, ascii->state.ascii == 0);
467 _PyObject_ASSERT(op, ascii->state.ready == 1);
468 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100469 }
470 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200471 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
472
473 data = unicode->data.any;
474 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200475 _PyObject_ASSERT(op, ascii->length == 0);
476 _PyObject_ASSERT(op, ascii->hash == -1);
477 _PyObject_ASSERT(op, ascii->state.compact == 0);
478 _PyObject_ASSERT(op, ascii->state.ascii == 0);
479 _PyObject_ASSERT(op, ascii->state.ready == 0);
480 _PyObject_ASSERT(op, ascii->state.interned == SSTATE_NOT_INTERNED);
481 _PyObject_ASSERT(op, ascii->wstr != NULL);
482 _PyObject_ASSERT(op, data == NULL);
483 _PyObject_ASSERT(op, compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200484 }
485 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200486 _PyObject_ASSERT(op, kind == PyUnicode_1BYTE_KIND
487 || kind == PyUnicode_2BYTE_KIND
488 || kind == PyUnicode_4BYTE_KIND);
489 _PyObject_ASSERT(op, ascii->state.compact == 0);
490 _PyObject_ASSERT(op, ascii->state.ready == 1);
491 _PyObject_ASSERT(op, data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200492 if (ascii->state.ascii) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200493 _PyObject_ASSERT(op, compact->utf8 == data);
494 _PyObject_ASSERT(op, compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200495 }
496 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200497 _PyObject_ASSERT(op, compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200498 }
499 }
500 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200501 if (
502#if SIZEOF_WCHAR_T == 2
503 kind == PyUnicode_2BYTE_KIND
504#else
505 kind == PyUnicode_4BYTE_KIND
506#endif
507 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200508 {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200509 _PyObject_ASSERT(op, ascii->wstr == data);
510 _PyObject_ASSERT(op, compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200511 } else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200512 _PyObject_ASSERT(op, ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200513 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200514
515 if (compact->utf8 == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200516 _PyObject_ASSERT(op, compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 if (ascii->wstr == NULL)
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200518 _PyObject_ASSERT(op, compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200519 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520
521 /* check that the best kind is used: O(n) operation */
522 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200523 Py_ssize_t i;
524 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200525 void *data;
526 Py_UCS4 ch;
527
528 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200529 for (i=0; i < ascii->length; i++)
530 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200531 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200532 if (ch > maxchar)
533 maxchar = ch;
534 }
535 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100536 if (ascii->state.ascii == 0) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200537 _PyObject_ASSERT(op, maxchar >= 128);
538 _PyObject_ASSERT(op, maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100539 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200540 else
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200541 _PyObject_ASSERT(op, maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200542 }
Victor Stinner77faf692011-11-20 18:56:05 +0100543 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200544 _PyObject_ASSERT(op, maxchar >= 0x100);
545 _PyObject_ASSERT(op, maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100546 }
547 else {
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200548 _PyObject_ASSERT(op, maxchar >= 0x10000);
549 _PyObject_ASSERT(op, maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100550 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200551 _PyObject_ASSERT(op, PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200552 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400553 return 1;
554}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200555
Victor Stinner910337b2011-10-03 03:20:16 +0200556
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100557static PyObject*
558unicode_result_wchar(PyObject *unicode)
559{
560#ifndef Py_DEBUG
561 Py_ssize_t len;
562
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100563 len = _PyUnicode_WSTR_LENGTH(unicode);
564 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100565 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200566 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100567 }
568
569 if (len == 1) {
570 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100571 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100572 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
573 Py_DECREF(unicode);
574 return latin1_char;
575 }
576 }
577
578 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200579 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100580 return NULL;
581 }
582#else
Victor Stinneraa771272012-10-04 02:32:58 +0200583 assert(Py_REFCNT(unicode) == 1);
584
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100585 /* don't make the result ready in debug mode to ensure that the caller
586 makes the string ready before using it */
587 assert(_PyUnicode_CheckConsistency(unicode, 1));
588#endif
589 return unicode;
590}
591
592static PyObject*
593unicode_result_ready(PyObject *unicode)
594{
595 Py_ssize_t length;
596
597 length = PyUnicode_GET_LENGTH(unicode);
598 if (length == 0) {
599 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100600 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200601 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100602 }
603 return unicode_empty;
604 }
605
606 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200607 void *data = PyUnicode_DATA(unicode);
608 int kind = PyUnicode_KIND(unicode);
609 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100610 if (ch < 256) {
611 PyObject *latin1_char = unicode_latin1[ch];
612 if (latin1_char != NULL) {
613 if (unicode != latin1_char) {
614 Py_INCREF(latin1_char);
615 Py_DECREF(unicode);
616 }
617 return latin1_char;
618 }
619 else {
620 assert(_PyUnicode_CheckConsistency(unicode, 1));
621 Py_INCREF(unicode);
622 unicode_latin1[ch] = unicode;
623 return unicode;
624 }
625 }
626 }
627
628 assert(_PyUnicode_CheckConsistency(unicode, 1));
629 return unicode;
630}
631
632static PyObject*
633unicode_result(PyObject *unicode)
634{
635 assert(_PyUnicode_CHECK(unicode));
636 if (PyUnicode_IS_READY(unicode))
637 return unicode_result_ready(unicode);
638 else
639 return unicode_result_wchar(unicode);
640}
641
Victor Stinnerc4b49542011-12-11 22:44:26 +0100642static PyObject*
643unicode_result_unchanged(PyObject *unicode)
644{
645 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500646 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100647 return NULL;
648 Py_INCREF(unicode);
649 return unicode;
650 }
651 else
652 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100653 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100654}
655
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200656/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
657 ASCII, Latin1, UTF-8, etc. */
658static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200659backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200660 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
661{
Victor Stinnerad771582015-10-09 12:38:53 +0200662 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200663 Py_UCS4 ch;
664 enum PyUnicode_Kind kind;
665 void *data;
666
667 assert(PyUnicode_IS_READY(unicode));
668 kind = PyUnicode_KIND(unicode);
669 data = PyUnicode_DATA(unicode);
670
671 size = 0;
672 /* determine replacement size */
673 for (i = collstart; i < collend; ++i) {
674 Py_ssize_t incr;
675
676 ch = PyUnicode_READ(kind, data, i);
677 if (ch < 0x100)
678 incr = 2+2;
679 else if (ch < 0x10000)
680 incr = 2+4;
681 else {
682 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200683 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200684 }
685 if (size > PY_SSIZE_T_MAX - incr) {
686 PyErr_SetString(PyExc_OverflowError,
687 "encoded result is too long for a Python string");
688 return NULL;
689 }
690 size += incr;
691 }
692
Victor Stinnerad771582015-10-09 12:38:53 +0200693 str = _PyBytesWriter_Prepare(writer, str, size);
694 if (str == NULL)
695 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200696
697 /* generate replacement */
698 for (i = collstart; i < collend; ++i) {
699 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200700 *str++ = '\\';
701 if (ch >= 0x00010000) {
702 *str++ = 'U';
703 *str++ = Py_hexdigits[(ch>>28)&0xf];
704 *str++ = Py_hexdigits[(ch>>24)&0xf];
705 *str++ = Py_hexdigits[(ch>>20)&0xf];
706 *str++ = Py_hexdigits[(ch>>16)&0xf];
707 *str++ = Py_hexdigits[(ch>>12)&0xf];
708 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200709 }
Victor Stinner797485e2015-10-09 03:17:30 +0200710 else if (ch >= 0x100) {
711 *str++ = 'u';
712 *str++ = Py_hexdigits[(ch>>12)&0xf];
713 *str++ = Py_hexdigits[(ch>>8)&0xf];
714 }
715 else
716 *str++ = 'x';
717 *str++ = Py_hexdigits[(ch>>4)&0xf];
718 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200719 }
720 return str;
721}
722
723/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
724 ASCII, Latin1, UTF-8, etc. */
725static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200726xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200727 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
728{
Victor Stinnerad771582015-10-09 12:38:53 +0200729 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200730 Py_UCS4 ch;
731 enum PyUnicode_Kind kind;
732 void *data;
733
734 assert(PyUnicode_IS_READY(unicode));
735 kind = PyUnicode_KIND(unicode);
736 data = PyUnicode_DATA(unicode);
737
738 size = 0;
739 /* determine replacement size */
740 for (i = collstart; i < collend; ++i) {
741 Py_ssize_t incr;
742
743 ch = PyUnicode_READ(kind, data, i);
744 if (ch < 10)
745 incr = 2+1+1;
746 else if (ch < 100)
747 incr = 2+2+1;
748 else if (ch < 1000)
749 incr = 2+3+1;
750 else if (ch < 10000)
751 incr = 2+4+1;
752 else if (ch < 100000)
753 incr = 2+5+1;
754 else if (ch < 1000000)
755 incr = 2+6+1;
756 else {
757 assert(ch <= MAX_UNICODE);
758 incr = 2+7+1;
759 }
760 if (size > PY_SSIZE_T_MAX - incr) {
761 PyErr_SetString(PyExc_OverflowError,
762 "encoded result is too long for a Python string");
763 return NULL;
764 }
765 size += incr;
766 }
767
Victor Stinnerad771582015-10-09 12:38:53 +0200768 str = _PyBytesWriter_Prepare(writer, str, size);
769 if (str == NULL)
770 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200771
772 /* generate replacement */
773 for (i = collstart; i < collend; ++i) {
774 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
775 }
776 return str;
777}
778
Thomas Wouters477c8d52006-05-27 19:21:47 +0000779/* --- Bloom Filters ----------------------------------------------------- */
780
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   5, 6 or 7 bits (depending on BLOOM_WIDTH) of each Unicode character
   as the bit index. */
784
785/* the linebreak mask is set up by Unicode_Init below */
786
Antoine Pitrouf068f942010-01-13 14:19:12 +0000787#if LONG_BIT >= 128
788#define BLOOM_WIDTH 128
789#elif LONG_BIT >= 64
790#define BLOOM_WIDTH 64
791#elif LONG_BIT >= 32
792#define BLOOM_WIDTH 32
793#else
794#error "LONG_BIT is smaller than 32"
795#endif
796
Thomas Wouters477c8d52006-05-27 19:21:47 +0000797#define BLOOM_MASK unsigned long
798
Serhiy Storchaka05997252013-01-26 12:14:02 +0200799static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000800
Antoine Pitrouf068f942010-01-13 14:19:12 +0000801#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000802
Benjamin Peterson29060642009-01-31 22:14:21 +0000803#define BLOOM_LINEBREAK(ch) \
804 ((ch) < 128U ? ascii_linebreak[(ch)] : \
805 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000806
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700807static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200808make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000809{
Victor Stinnera85af502013-04-09 21:53:54 +0200810#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
811 do { \
812 TYPE *data = (TYPE *)PTR; \
813 TYPE *end = data + LEN; \
814 Py_UCS4 ch; \
815 for (; data != end; data++) { \
816 ch = *data; \
817 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
818 } \
819 break; \
820 } while (0)
821
Thomas Wouters477c8d52006-05-27 19:21:47 +0000822 /* calculate simple bloom-style bitmask for a given unicode string */
823
Antoine Pitrouf068f942010-01-13 14:19:12 +0000824 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000825
826 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200827 switch (kind) {
828 case PyUnicode_1BYTE_KIND:
829 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
830 break;
831 case PyUnicode_2BYTE_KIND:
832 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
833 break;
834 case PyUnicode_4BYTE_KIND:
835 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
836 break;
837 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700838 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200839 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000840 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200841
842#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000843}
844
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300845static int
846ensure_unicode(PyObject *obj)
847{
848 if (!PyUnicode_Check(obj)) {
849 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200850 "must be str, not %.100s",
851 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300852 return -1;
853 }
854 return PyUnicode_READY(obj);
855}
856
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200857/* Compilation of templated routines */
858
859#include "stringlib/asciilib.h"
860#include "stringlib/fastsearch.h"
861#include "stringlib/partition.h"
862#include "stringlib/split.h"
863#include "stringlib/count.h"
864#include "stringlib/find.h"
865#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200866#include "stringlib/undef.h"
867
868#include "stringlib/ucs1lib.h"
869#include "stringlib/fastsearch.h"
870#include "stringlib/partition.h"
871#include "stringlib/split.h"
872#include "stringlib/count.h"
873#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300874#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200875#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200876#include "stringlib/undef.h"
877
878#include "stringlib/ucs2lib.h"
879#include "stringlib/fastsearch.h"
880#include "stringlib/partition.h"
881#include "stringlib/split.h"
882#include "stringlib/count.h"
883#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300884#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200885#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200886#include "stringlib/undef.h"
887
888#include "stringlib/ucs4lib.h"
889#include "stringlib/fastsearch.h"
890#include "stringlib/partition.h"
891#include "stringlib/split.h"
892#include "stringlib/count.h"
893#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300894#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200895#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200896#include "stringlib/undef.h"
897
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200898#include "stringlib/unicodedefs.h"
899#include "stringlib/fastsearch.h"
900#include "stringlib/count.h"
901#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100902#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200903
Guido van Rossumd57fd912000-03-10 22:53:23 +0000904/* --- Unicode Object ----------------------------------------------------- */
905
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700906static inline Py_ssize_t
907findchar(const void *s, int kind,
908 Py_ssize_t size, Py_UCS4 ch,
909 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200910{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200911 switch (kind) {
912 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200913 if ((Py_UCS1) ch != ch)
914 return -1;
915 if (direction > 0)
916 return ucs1lib_find_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
917 else
918 return ucs1lib_rfind_char((Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200919 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200920 if ((Py_UCS2) ch != ch)
921 return -1;
922 if (direction > 0)
923 return ucs2lib_find_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
924 else
925 return ucs2lib_rfind_char((Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200926 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200927 if (direction > 0)
928 return ucs4lib_find_char((Py_UCS4 *) s, size, ch);
929 else
930 return ucs4lib_rfind_char((Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200931 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700932 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200933 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200934}
935
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters in [old_length, length) with 0xff
   bytes so that use of uninitialized data is detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0.

   Nothing is written when the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int unit = PyUnicode_KIND(unicode);         /* bytes per character */
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * unit, 0xff,
               (new_length - old_length) * unit);
    }
}
#endif
954
Victor Stinnerfe226c02011-10-03 03:52:20 +0200955static PyObject*
956resize_compact(PyObject *unicode, Py_ssize_t length)
957{
958 Py_ssize_t char_size;
959 Py_ssize_t struct_size;
960 Py_ssize_t new_size;
961 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +0100962 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +0200963#ifdef Py_DEBUG
964 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
965#endif
966
Victor Stinner79891572012-05-03 13:43:07 +0200967 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +0200968 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +0100969 assert(PyUnicode_IS_COMPACT(unicode));
970
Martin v. Löwisc47adb02011-10-07 20:55:35 +0200971 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +0100972 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +0200973 struct_size = sizeof(PyASCIIObject);
974 else
975 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +0200976 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200977
Victor Stinnerfe226c02011-10-03 03:52:20 +0200978 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
979 PyErr_NoMemory();
980 return NULL;
981 }
982 new_size = (struct_size + (length + 1) * char_size);
983
Serhiy Storchaka7aa69082015-12-03 01:02:03 +0200984 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
985 PyObject_DEL(_PyUnicode_UTF8(unicode));
986 _PyUnicode_UTF8(unicode) = NULL;
987 _PyUnicode_UTF8_LENGTH(unicode) = 0;
988 }
Victor Stinner84def372011-12-11 20:04:56 +0100989 _Py_DEC_REFTOTAL;
990 _Py_ForgetReference(unicode);
991
Serhiy Storchaka20b39b22014-09-28 11:27:24 +0300992 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +0100993 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +0100994 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +0200995 PyErr_NoMemory();
996 return NULL;
997 }
Victor Stinner84def372011-12-11 20:04:56 +0100998 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +0200999 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001000
Victor Stinnerfe226c02011-10-03 03:52:20 +02001001 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001002 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001003 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001004 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001005 _PyUnicode_WSTR_LENGTH(unicode) = length;
1006 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001007 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1008 PyObject_DEL(_PyUnicode_WSTR(unicode));
1009 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001010 if (!PyUnicode_IS_ASCII(unicode))
1011 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001012 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001013#ifdef Py_DEBUG
1014 unicode_fill_invalid(unicode, old_length);
1015#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001016 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1017 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001018 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001019 return unicode;
1020}
1021
Alexander Belopolsky40018472011-02-26 01:02:56 +00001022static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001023resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001024{
Victor Stinner95663112011-10-04 01:03:50 +02001025 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001026 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001027 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001028 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001029
Victor Stinnerfe226c02011-10-03 03:52:20 +02001030 if (PyUnicode_IS_READY(unicode)) {
1031 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001032 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001033 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001034#ifdef Py_DEBUG
1035 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1036#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001037
1038 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001039 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001040 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1041 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001042
1043 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1044 PyErr_NoMemory();
1045 return -1;
1046 }
1047 new_size = (length + 1) * char_size;
1048
Victor Stinner7a9105a2011-12-12 00:13:42 +01001049 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1050 {
1051 PyObject_DEL(_PyUnicode_UTF8(unicode));
1052 _PyUnicode_UTF8(unicode) = NULL;
1053 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1054 }
1055
Victor Stinnerfe226c02011-10-03 03:52:20 +02001056 data = (PyObject *)PyObject_REALLOC(data, new_size);
1057 if (data == NULL) {
1058 PyErr_NoMemory();
1059 return -1;
1060 }
1061 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001062 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 _PyUnicode_WSTR_LENGTH(unicode) = length;
1065 }
1066 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001067 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001068 _PyUnicode_UTF8_LENGTH(unicode) = length;
1069 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001070 _PyUnicode_LENGTH(unicode) = length;
1071 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001072#ifdef Py_DEBUG
1073 unicode_fill_invalid(unicode, old_length);
1074#endif
Victor Stinner95663112011-10-04 01:03:50 +02001075 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001076 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001077 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001079 }
Victor Stinner95663112011-10-04 01:03:50 +02001080 assert(_PyUnicode_WSTR(unicode) != NULL);
1081
1082 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001083 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001084 PyErr_NoMemory();
1085 return -1;
1086 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001087 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001088 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001089 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001090 if (!wstr) {
1091 PyErr_NoMemory();
1092 return -1;
1093 }
1094 _PyUnicode_WSTR(unicode) = wstr;
1095 _PyUnicode_WSTR(unicode)[length] = 0;
1096 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001097 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001098 return 0;
1099}
1100
Victor Stinnerfe226c02011-10-03 03:52:20 +02001101static PyObject*
1102resize_copy(PyObject *unicode, Py_ssize_t length)
1103{
1104 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001105 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001106 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001107
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001108 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001109
1110 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1111 if (copy == NULL)
1112 return NULL;
1113
1114 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001115 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001116 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001117 }
1118 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001119 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001120
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001121 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001122 if (w == NULL)
1123 return NULL;
1124 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1125 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001126 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001127 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001128 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001129 }
1130}
1131
Guido van Rossumd57fd912000-03-10 22:53:23 +00001132/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001133 Ux0000 terminated; some code (e.g. new_identifier)
1134 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001135
1136 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001137 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001138
1139*/
1140
Alexander Belopolsky40018472011-02-26 01:02:56 +00001141static PyUnicodeObject *
1142_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001143{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001144 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001145 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001146
Thomas Wouters477c8d52006-05-27 19:21:47 +00001147 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001148 if (length == 0 && unicode_empty != NULL) {
1149 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001150 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001151 }
1152
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001153 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001154 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001155 return (PyUnicodeObject *)PyErr_NoMemory();
1156 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001157 if (length < 0) {
1158 PyErr_SetString(PyExc_SystemError,
1159 "Negative size passed to _PyUnicode_New");
1160 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001161 }
1162
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001163 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1164 if (unicode == NULL)
1165 return NULL;
1166 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001167
1168 _PyUnicode_WSTR_LENGTH(unicode) = length;
1169 _PyUnicode_HASH(unicode) = -1;
1170 _PyUnicode_STATE(unicode).interned = 0;
1171 _PyUnicode_STATE(unicode).kind = 0;
1172 _PyUnicode_STATE(unicode).compact = 0;
1173 _PyUnicode_STATE(unicode).ready = 0;
1174 _PyUnicode_STATE(unicode).ascii = 0;
1175 _PyUnicode_DATA_ANY(unicode) = NULL;
1176 _PyUnicode_LENGTH(unicode) = 0;
1177 _PyUnicode_UTF8(unicode) = NULL;
1178 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1179
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001180 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1181 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001182 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001183 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001184 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001185 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001186
Jeremy Hyltond8082792003-09-16 19:41:39 +00001187 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001188 * the caller fails before initializing str -- unicode_resize()
1189 * reads str[0], and the Keep-Alive optimization can keep memory
1190 * allocated for str alive across a call to unicode_dealloc(unicode).
1191 * We don't want unicode_resize to read uninitialized memory in
1192 * that case.
1193 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001194 _PyUnicode_WSTR(unicode)[0] = 0;
1195 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001196
Victor Stinner7931d9a2011-11-04 00:22:48 +01001197 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001198 return unicode;
1199}
1200
Victor Stinnerf42dc442011-10-02 23:33:16 +02001201static const char*
1202unicode_kind_name(PyObject *unicode)
1203{
Victor Stinner42dfd712011-10-03 14:41:45 +02001204 /* don't check consistency: unicode_kind_name() is called from
1205 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001206 if (!PyUnicode_IS_COMPACT(unicode))
1207 {
1208 if (!PyUnicode_IS_READY(unicode))
1209 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001210 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001211 {
1212 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001213 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001214 return "legacy ascii";
1215 else
1216 return "legacy latin1";
1217 case PyUnicode_2BYTE_KIND:
1218 return "legacy UCS2";
1219 case PyUnicode_4BYTE_KIND:
1220 return "legacy UCS4";
1221 default:
1222 return "<legacy invalid kind>";
1223 }
1224 }
1225 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001226 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001227 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001228 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001229 return "ascii";
1230 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001231 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001232 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001233 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001234 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001235 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001236 default:
1237 return "<invalid compact kind>";
1238 }
1239}
1240
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001241#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *_PyUnicode_utf8(void *unicode_raw)
{
    return PyUnicode_UTF8(_PyObject_CAST(unicode_raw));
}
1247
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *_PyUnicode_compact_data(void *unicode_raw)
{
    return _PyUnicode_COMPACT_DATA(_PyObject_CAST(unicode_raw));
}
Victor Stinnera42de742018-11-22 10:25:22 +01001252void *_PyUnicode_data(void *unicode_raw) {
1253 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001254 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001255 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1256 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1257 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1258 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1259 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1260 return PyUnicode_DATA(unicode);
1261}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001262
1263void
1264_PyUnicode_Dump(PyObject *op)
1265{
1266 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001267 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1268 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1269 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001270
Victor Stinnera849a4b2011-10-03 12:12:11 +02001271 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001272 {
1273 if (ascii->state.ascii)
1274 data = (ascii + 1);
1275 else
1276 data = (compact + 1);
1277 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001278 else
1279 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001280 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1281 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001282
Victor Stinnera849a4b2011-10-03 12:12:11 +02001283 if (ascii->wstr == data)
1284 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001285 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001286
Victor Stinnera3b334d2011-10-03 13:53:37 +02001287 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001288 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001289 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1290 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001291 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001292 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001293 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001294 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001295}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001296#endif
1297
1298PyObject *
1299PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1300{
1301 PyObject *obj;
1302 PyCompactUnicodeObject *unicode;
1303 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001304 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001305 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001306 Py_ssize_t char_size;
1307 Py_ssize_t struct_size;
1308
1309 /* Optimization for empty strings */
1310 if (size == 0 && unicode_empty != NULL) {
1311 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001312 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001313 }
1314
Victor Stinner9e9d6892011-10-04 01:02:02 +02001315 is_ascii = 0;
1316 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 struct_size = sizeof(PyCompactUnicodeObject);
1318 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001319 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001320 char_size = 1;
1321 is_ascii = 1;
1322 struct_size = sizeof(PyASCIIObject);
1323 }
1324 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001325 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001326 char_size = 1;
1327 }
1328 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001329 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001330 char_size = 2;
1331 if (sizeof(wchar_t) == 2)
1332 is_sharing = 1;
1333 }
1334 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001335 if (maxchar > MAX_UNICODE) {
1336 PyErr_SetString(PyExc_SystemError,
1337 "invalid maximum character passed to PyUnicode_New");
1338 return NULL;
1339 }
Victor Stinner8f825062012-04-27 13:55:39 +02001340 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001341 char_size = 4;
1342 if (sizeof(wchar_t) == 4)
1343 is_sharing = 1;
1344 }
1345
1346 /* Ensure we won't overflow the size. */
1347 if (size < 0) {
1348 PyErr_SetString(PyExc_SystemError,
1349 "Negative size passed to PyUnicode_New");
1350 return NULL;
1351 }
1352 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1353 return PyErr_NoMemory();
1354
1355 /* Duplicated allocation code from _PyObject_New() instead of a call to
1356 * PyObject_New() so we are able to allocate space for the object and
1357 * it's data buffer.
1358 */
1359 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1360 if (obj == NULL)
1361 return PyErr_NoMemory();
1362 obj = PyObject_INIT(obj, &PyUnicode_Type);
1363 if (obj == NULL)
1364 return NULL;
1365
1366 unicode = (PyCompactUnicodeObject *)obj;
1367 if (is_ascii)
1368 data = ((PyASCIIObject*)obj) + 1;
1369 else
1370 data = unicode + 1;
1371 _PyUnicode_LENGTH(unicode) = size;
1372 _PyUnicode_HASH(unicode) = -1;
1373 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001374 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 _PyUnicode_STATE(unicode).compact = 1;
1376 _PyUnicode_STATE(unicode).ready = 1;
1377 _PyUnicode_STATE(unicode).ascii = is_ascii;
1378 if (is_ascii) {
1379 ((char*)data)[size] = 0;
1380 _PyUnicode_WSTR(unicode) = NULL;
1381 }
Victor Stinner8f825062012-04-27 13:55:39 +02001382 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001383 ((char*)data)[size] = 0;
1384 _PyUnicode_WSTR(unicode) = NULL;
1385 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001386 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001387 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001388 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001389 else {
1390 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001391 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001392 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001393 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001394 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001395 ((Py_UCS4*)data)[size] = 0;
1396 if (is_sharing) {
1397 _PyUnicode_WSTR_LENGTH(unicode) = size;
1398 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1399 }
1400 else {
1401 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1402 _PyUnicode_WSTR(unicode) = NULL;
1403 }
1404 }
Victor Stinner8f825062012-04-27 13:55:39 +02001405#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001406 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001407#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001408 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001409 return obj;
1410}
1411
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   A high surrogate immediately followed by a low surrogate is joined into
   a single code point; every other unit (including a lone surrogate) is
   copied through unchanged.  The other conversions are implemented as
   macros for efficiency.

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        if (Py_UNICODE_IS_HIGH_SURROGATE(src[0])
            && (src + 1) < end
            && Py_UNICODE_IS_LOW_SURROGATE(src[1]))
        {
            /* A well-formed pair consumes two input units. */
            *dst = Py_UNICODE_JOIN_SURROGATES(src[0], src[1]);
            src += 2;
        }
        else {
            *dst = *src;
            src += 1;
        }
        dst++;
    }
    /* The output must have been filled exactly. */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));
}
#endif
1451
Victor Stinnercd9950f2011-10-02 00:34:53 +02001452static int
Victor Stinner488fa492011-12-12 00:01:39 +01001453unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001454{
Victor Stinner488fa492011-12-12 00:01:39 +01001455 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001456 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001457 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001458 return -1;
1459 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001460 return 0;
1461}
1462
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001463static int
1464_copy_characters(PyObject *to, Py_ssize_t to_start,
1465 PyObject *from, Py_ssize_t from_start,
1466 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001467{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001468 unsigned int from_kind, to_kind;
1469 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001470
Victor Stinneree4544c2012-05-09 22:24:08 +02001471 assert(0 <= how_many);
1472 assert(0 <= from_start);
1473 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001474 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001475 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001476 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001477
Victor Stinnerd3f08822012-05-29 12:57:52 +02001478 assert(PyUnicode_Check(to));
1479 assert(PyUnicode_IS_READY(to));
1480 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1481
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001482 if (how_many == 0)
1483 return 0;
1484
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001485 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001486 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001487 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001488 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001489
Victor Stinnerf1852262012-06-16 16:38:26 +02001490#ifdef Py_DEBUG
1491 if (!check_maxchar
1492 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1493 {
1494 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1495 Py_UCS4 ch;
1496 Py_ssize_t i;
1497 for (i=0; i < how_many; i++) {
1498 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1499 assert(ch <= to_maxchar);
1500 }
1501 }
1502#endif
1503
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001504 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001505 if (check_maxchar
1506 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1507 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001508 /* Writing Latin-1 characters into an ASCII string requires to
1509 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001510 Py_UCS4 max_char;
1511 max_char = ucs1lib_find_max_char(from_data,
1512 (Py_UCS1*)from_data + how_many);
1513 if (max_char >= 128)
1514 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001515 }
Christian Heimesf051e432016-09-13 20:22:02 +02001516 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001517 (char*)from_data + from_kind * from_start,
1518 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001519 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001520 else if (from_kind == PyUnicode_1BYTE_KIND
1521 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001522 {
1523 _PyUnicode_CONVERT_BYTES(
1524 Py_UCS1, Py_UCS2,
1525 PyUnicode_1BYTE_DATA(from) + from_start,
1526 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1527 PyUnicode_2BYTE_DATA(to) + to_start
1528 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001529 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001530 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001531 && to_kind == PyUnicode_4BYTE_KIND)
1532 {
1533 _PyUnicode_CONVERT_BYTES(
1534 Py_UCS1, Py_UCS4,
1535 PyUnicode_1BYTE_DATA(from) + from_start,
1536 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1537 PyUnicode_4BYTE_DATA(to) + to_start
1538 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001539 }
1540 else if (from_kind == PyUnicode_2BYTE_KIND
1541 && to_kind == PyUnicode_4BYTE_KIND)
1542 {
1543 _PyUnicode_CONVERT_BYTES(
1544 Py_UCS2, Py_UCS4,
1545 PyUnicode_2BYTE_DATA(from) + from_start,
1546 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1547 PyUnicode_4BYTE_DATA(to) + to_start
1548 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001549 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001551 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1552
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001553 if (!check_maxchar) {
1554 if (from_kind == PyUnicode_2BYTE_KIND
1555 && to_kind == PyUnicode_1BYTE_KIND)
1556 {
1557 _PyUnicode_CONVERT_BYTES(
1558 Py_UCS2, Py_UCS1,
1559 PyUnicode_2BYTE_DATA(from) + from_start,
1560 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1561 PyUnicode_1BYTE_DATA(to) + to_start
1562 );
1563 }
1564 else if (from_kind == PyUnicode_4BYTE_KIND
1565 && to_kind == PyUnicode_1BYTE_KIND)
1566 {
1567 _PyUnicode_CONVERT_BYTES(
1568 Py_UCS4, Py_UCS1,
1569 PyUnicode_4BYTE_DATA(from) + from_start,
1570 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1571 PyUnicode_1BYTE_DATA(to) + to_start
1572 );
1573 }
1574 else if (from_kind == PyUnicode_4BYTE_KIND
1575 && to_kind == PyUnicode_2BYTE_KIND)
1576 {
1577 _PyUnicode_CONVERT_BYTES(
1578 Py_UCS4, Py_UCS2,
1579 PyUnicode_4BYTE_DATA(from) + from_start,
1580 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1581 PyUnicode_2BYTE_DATA(to) + to_start
1582 );
1583 }
1584 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001585 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001586 }
1587 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001588 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001589 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001590 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001591 Py_ssize_t i;
1592
Victor Stinnera0702ab2011-09-29 14:14:38 +02001593 for (i=0; i < how_many; i++) {
1594 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001595 if (ch > to_maxchar)
1596 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001597 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1598 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001599 }
1600 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001601 return 0;
1602}
1603
Victor Stinnerd3f08822012-05-29 12:57:52 +02001604void
1605_PyUnicode_FastCopyCharacters(
1606 PyObject *to, Py_ssize_t to_start,
1607 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001608{
1609 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1610}
1611
1612Py_ssize_t
1613PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1614 PyObject *from, Py_ssize_t from_start,
1615 Py_ssize_t how_many)
1616{
1617 int err;
1618
1619 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1620 PyErr_BadInternalCall();
1621 return -1;
1622 }
1623
Benjamin Petersonbac79492012-01-14 13:34:47 -05001624 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001625 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001626 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001627 return -1;
1628
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001629 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001630 PyErr_SetString(PyExc_IndexError, "string index out of range");
1631 return -1;
1632 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001633 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001634 PyErr_SetString(PyExc_IndexError, "string index out of range");
1635 return -1;
1636 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001637 if (how_many < 0) {
1638 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1639 return -1;
1640 }
1641 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001642 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1643 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001644 "Cannot write %zi characters at %zi "
1645 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001646 how_many, to_start, PyUnicode_GET_LENGTH(to));
1647 return -1;
1648 }
1649
1650 if (how_many == 0)
1651 return 0;
1652
Victor Stinner488fa492011-12-12 00:01:39 +01001653 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001654 return -1;
1655
1656 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1657 if (err) {
1658 PyErr_Format(PyExc_SystemError,
1659 "Cannot copy %s characters "
1660 "into a string of %s characters",
1661 unicode_kind_name(from),
1662 unicode_kind_name(to));
1663 return -1;
1664 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001665 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001666}
1667
Victor Stinner17222162011-09-28 22:15:37 +02001668/* Find the maximum code point and count the number of surrogate pairs so a
1669 correct string length can be computed before converting a string to UCS4.
1670 This function counts single surrogates as a character and not as a pair.
1671
1672 Return 0 on success, or -1 on error. */
1673static int
1674find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1675 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001676{
1677 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001678 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001679
Victor Stinnerc53be962011-10-02 21:33:54 +02001680 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001681 *num_surrogates = 0;
1682 *maxchar = 0;
1683
1684 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001685#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001686 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1687 && (iter+1) < end
1688 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1689 {
1690 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1691 ++(*num_surrogates);
1692 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001693 }
1694 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001695#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001696 {
1697 ch = *iter;
1698 iter++;
1699 }
1700 if (ch > *maxchar) {
1701 *maxchar = ch;
1702 if (*maxchar > MAX_UNICODE) {
1703 PyErr_Format(PyExc_ValueError,
1704 "character U+%x is not in range [U+0000; U+10ffff]",
1705 ch);
1706 return -1;
1707 }
1708 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001709 }
1710 return 0;
1711}
1712
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001713int
1714_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001715{
1716 wchar_t *end;
1717 Py_UCS4 maxchar = 0;
1718 Py_ssize_t num_surrogates;
1719#if SIZEOF_WCHAR_T == 2
1720 Py_ssize_t length_wo_surrogates;
1721#endif
1722
Georg Brandl7597add2011-10-05 16:36:47 +02001723 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001724 strings were created using _PyObject_New() and where no canonical
1725 representation (the str field) has been set yet aka strings
1726 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001727 assert(_PyUnicode_CHECK(unicode));
1728 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001729 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001730 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001731 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001732 /* Actually, it should neither be interned nor be anything else: */
1733 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001735 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001736 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001737 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001739
1740 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001741 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1742 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 PyErr_NoMemory();
1744 return -1;
1745 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001746 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747 _PyUnicode_WSTR(unicode), end,
1748 PyUnicode_1BYTE_DATA(unicode));
1749 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1750 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1751 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1752 if (maxchar < 128) {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001753 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001754 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001755 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001756 }
1757 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001758 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001759 _PyUnicode_UTF8(unicode) = NULL;
1760 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001761 }
1762 PyObject_FREE(_PyUnicode_WSTR(unicode));
1763 _PyUnicode_WSTR(unicode) = NULL;
1764 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1765 }
1766 /* In this case we might have to convert down from 4-byte native
1767 wchar_t to 2-byte unicode. */
1768 else if (maxchar < 65536) {
1769 assert(num_surrogates == 0 &&
1770 "FindMaxCharAndNumSurrogatePairs() messed up");
1771
Victor Stinner506f5922011-09-28 22:34:18 +02001772#if SIZEOF_WCHAR_T == 2
1773 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001774 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001775 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1776 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1777 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001778 _PyUnicode_UTF8(unicode) = NULL;
1779 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001780#else
1781 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001782 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001783 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001784 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001785 PyErr_NoMemory();
1786 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001787 }
Victor Stinner506f5922011-09-28 22:34:18 +02001788 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1789 _PyUnicode_WSTR(unicode), end,
1790 PyUnicode_2BYTE_DATA(unicode));
1791 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1792 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1793 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001794 _PyUnicode_UTF8(unicode) = NULL;
1795 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001796 PyObject_FREE(_PyUnicode_WSTR(unicode));
1797 _PyUnicode_WSTR(unicode) = NULL;
1798 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1799#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 }
1801 /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1802 else {
1803#if SIZEOF_WCHAR_T == 2
1804 /* in case the native representation is 2-bytes, we need to allocate a
1805 new normalized 4-byte version. */
1806 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001807 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1808 PyErr_NoMemory();
1809 return -1;
1810 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001811 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1812 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001813 PyErr_NoMemory();
1814 return -1;
1815 }
1816 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1817 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001818 _PyUnicode_UTF8(unicode) = NULL;
1819 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001820 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1821 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001822 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 PyObject_FREE(_PyUnicode_WSTR(unicode));
1824 _PyUnicode_WSTR(unicode) = NULL;
1825 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1826#else
1827 assert(num_surrogates == 0);
1828
Victor Stinnerc3c74152011-10-02 20:39:55 +02001829 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001830 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001831 _PyUnicode_UTF8(unicode) = NULL;
1832 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001833 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1834#endif
1835 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1836 }
1837 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001838 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001839 return 0;
1840}
1841
Alexander Belopolsky40018472011-02-26 01:02:56 +00001842static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001843unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001844{
Walter Dörwald16807132007-05-25 13:52:07 +00001845 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001846 case SSTATE_NOT_INTERNED:
1847 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001848
Benjamin Peterson29060642009-01-31 22:14:21 +00001849 case SSTATE_INTERNED_MORTAL:
1850 /* revive dead object temporarily for DelItem */
1851 Py_REFCNT(unicode) = 3;
Victor Stinner7931d9a2011-11-04 00:22:48 +01001852 if (PyDict_DelItem(interned, unicode) != 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00001853 Py_FatalError(
1854 "deletion of interned string failed");
1855 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001856
Benjamin Peterson29060642009-01-31 22:14:21 +00001857 case SSTATE_INTERNED_IMMORTAL:
1858 Py_FatalError("Immortal interned string died.");
Stefan Krahf432a322017-08-21 13:09:59 +02001859 /* fall through */
Walter Dörwald16807132007-05-25 13:52:07 +00001860
Benjamin Peterson29060642009-01-31 22:14:21 +00001861 default:
1862 Py_FatalError("Inconsistent interned string state.");
Walter Dörwald16807132007-05-25 13:52:07 +00001863 }
1864
Victor Stinner03490912011-10-03 23:45:12 +02001865 if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001866 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinner829c0ad2011-10-03 01:08:02 +02001867 if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001868 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001869 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1870 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001871
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001872 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001873}
1874
#ifdef Py_DEBUG
/* Debug helper: return 1 if `unicode` is the shared empty string or one of
   the cached one-character strings in unicode_latin1[], 0 otherwise. */
static int
unicode_is_singleton(PyObject *unicode)
{
    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
    Py_UCS4 ch;

    if (unicode == unicode_empty)
        return 1;

    /* Only a ready string of exactly one character can be a cached one. */
    if (ascii->state.kind == PyUnicode_WCHAR_KIND || ascii->length != 1)
        return 0;

    ch = PyUnicode_READ_CHAR(unicode, 0);
    return (ch < 256 && unicode_latin1[ch] == unicode) ? 1 : 0;
}
#endif
1891
Alexander Belopolsky40018472011-02-26 01:02:56 +00001892static int
Victor Stinner488fa492011-12-12 00:01:39 +01001893unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001894{
Victor Stinner488fa492011-12-12 00:01:39 +01001895 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001896 if (Py_REFCNT(unicode) != 1)
1897 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001898 if (_PyUnicode_HASH(unicode) != -1)
1899 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001900 if (PyUnicode_CHECK_INTERNED(unicode))
1901 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001902 if (!PyUnicode_CheckExact(unicode))
1903 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001904#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001905 /* singleton refcount is greater than 1 */
1906 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001907#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001908 return 1;
1909}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001910
Victor Stinnerfe226c02011-10-03 03:52:20 +02001911static int
1912unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1913{
1914 PyObject *unicode;
1915 Py_ssize_t old_length;
1916
1917 assert(p_unicode != NULL);
1918 unicode = *p_unicode;
1919
1920 assert(unicode != NULL);
1921 assert(PyUnicode_Check(unicode));
1922 assert(0 <= length);
1923
Victor Stinner910337b2011-10-03 03:20:16 +02001924 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001925 old_length = PyUnicode_WSTR_LENGTH(unicode);
1926 else
1927 old_length = PyUnicode_GET_LENGTH(unicode);
1928 if (old_length == length)
1929 return 0;
1930
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001931 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001932 _Py_INCREF_UNICODE_EMPTY();
1933 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00001934 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001935 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001936 return 0;
1937 }
1938
Victor Stinner488fa492011-12-12 00:01:39 +01001939 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001940 PyObject *copy = resize_copy(unicode, length);
1941 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00001942 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03001943 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00001944 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001945 }
1946
Victor Stinnerfe226c02011-10-03 03:52:20 +02001947 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001948 PyObject *new_unicode = resize_compact(unicode, length);
1949 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001950 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001951 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001952 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04001953 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001954 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001955}
1956
Alexander Belopolsky40018472011-02-26 01:02:56 +00001957int
Victor Stinnerfe226c02011-10-03 03:52:20 +02001958PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001959{
Victor Stinnerfe226c02011-10-03 03:52:20 +02001960 PyObject *unicode;
1961 if (p_unicode == NULL) {
1962 PyErr_BadInternalCall();
1963 return -1;
1964 }
1965 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001966 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001967 {
1968 PyErr_BadInternalCall();
1969 return -1;
1970 }
1971 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00001972}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001973
Serhiy Storchakad65c9492015-11-02 14:10:23 +02001974/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01001975
Victor Stinnerb429d3b2012-02-22 21:22:20 +01001976 WARNING: The function doesn't copy the terminating null character and
1977 doesn't check the maximum character (may write a latin1 character in an
1978 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02001979static void
1980unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1981 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01001982{
1983 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1984 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02001985 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01001986
1987 switch (kind) {
1988 case PyUnicode_1BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01001989 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner8c6db452012-10-06 00:40:45 +02001990#ifdef Py_DEBUG
1991 if (PyUnicode_IS_ASCII(unicode)) {
1992 Py_UCS4 maxchar = ucs1lib_find_max_char(
1993 (const Py_UCS1*)str,
1994 (const Py_UCS1*)str + len);
1995 assert(maxchar < 128);
1996 }
1997#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01001998 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02001999 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002000 }
2001 case PyUnicode_2BYTE_KIND: {
2002 Py_UCS2 *start = (Py_UCS2 *)data + index;
2003 Py_UCS2 *ucs2 = start;
2004 assert(index <= PyUnicode_GET_LENGTH(unicode));
2005
Victor Stinner184252a2012-06-16 02:57:41 +02002006 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002007 *ucs2 = (Py_UCS2)*str;
2008
2009 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002010 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002011 }
2012 default: {
2013 Py_UCS4 *start = (Py_UCS4 *)data + index;
2014 Py_UCS4 *ucs4 = start;
2015 assert(kind == PyUnicode_4BYTE_KIND);
2016 assert(index <= PyUnicode_GET_LENGTH(unicode));
2017
Victor Stinner184252a2012-06-16 02:57:41 +02002018 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002019 *ucs4 = (Py_UCS4)*str;
2020
2021 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002022 }
2023 }
2024}
2025
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002026static PyObject*
2027get_latin1_char(unsigned char ch)
2028{
Victor Stinnera464fc12011-10-02 20:39:30 +02002029 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002030 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002031 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002032 if (!unicode)
2033 return NULL;
2034 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002035 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002036 unicode_latin1[ch] = unicode;
2037 }
2038 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002039 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002040}
2041
Victor Stinner985a82a2014-01-03 12:53:47 +01002042static PyObject*
2043unicode_char(Py_UCS4 ch)
2044{
2045 PyObject *unicode;
2046
2047 assert(ch <= MAX_UNICODE);
2048
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002049 if (ch < 256)
2050 return get_latin1_char(ch);
2051
Victor Stinner985a82a2014-01-03 12:53:47 +01002052 unicode = PyUnicode_New(1, ch);
2053 if (unicode == NULL)
2054 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002055
2056 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2057 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002058 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002059 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002060 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2061 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2062 }
2063 assert(_PyUnicode_CheckConsistency(unicode, 1));
2064 return unicode;
2065}
2066
Alexander Belopolsky40018472011-02-26 01:02:56 +00002067PyObject *
2068PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002069{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002070 if (u == NULL)
2071 return (PyObject*)_PyUnicode_New(size);
2072
2073 if (size < 0) {
2074 PyErr_BadInternalCall();
2075 return NULL;
2076 }
2077
2078 return PyUnicode_FromWideChar(u, size);
2079}
2080
2081PyObject *
2082PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2083{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002084 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002085 Py_UCS4 maxchar = 0;
2086 Py_ssize_t num_surrogates;
2087
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002088 if (u == NULL && size != 0) {
2089 PyErr_BadInternalCall();
2090 return NULL;
2091 }
2092
2093 if (size == -1) {
2094 size = wcslen(u);
2095 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002096
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002097 /* If the Unicode data is known at construction time, we can apply
2098 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002099
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002100 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002101 if (size == 0)
2102 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002103
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002104 /* Single character Unicode objects in the Latin-1 range are
2105 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002106 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002107 return get_latin1_char((unsigned char)*u);
2108
2109 /* If not empty and not single character, copy the Unicode data
2110 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002111 if (find_maxchar_surrogates(u, u + size,
2112 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002113 return NULL;
2114
Victor Stinner8faf8212011-12-08 22:14:11 +01002115 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002116 if (!unicode)
2117 return NULL;
2118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002119 switch (PyUnicode_KIND(unicode)) {
2120 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002121 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002122 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2123 break;
2124 case PyUnicode_2BYTE_KIND:
2125#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002126 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002127#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002128 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002129 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2130#endif
2131 break;
2132 case PyUnicode_4BYTE_KIND:
2133#if SIZEOF_WCHAR_T == 2
2134 /* This is the only case which has to process surrogates, thus
2135 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002136 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002137#else
2138 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002139 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002140#endif
2141 break;
2142 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002143 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002144 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002145
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002146 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002147}
2148
Alexander Belopolsky40018472011-02-26 01:02:56 +00002149PyObject *
2150PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002151{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002152 if (size < 0) {
2153 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002154 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002155 return NULL;
2156 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002157 if (u != NULL)
2158 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2159 else
2160 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002161}
2162
Alexander Belopolsky40018472011-02-26 01:02:56 +00002163PyObject *
2164PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002165{
2166 size_t size = strlen(u);
2167 if (size > PY_SSIZE_T_MAX) {
2168 PyErr_SetString(PyExc_OverflowError, "input too long");
2169 return NULL;
2170 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002171 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002172}
2173
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002174PyObject *
2175_PyUnicode_FromId(_Py_Identifier *id)
2176{
2177 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002178 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2179 strlen(id->string),
2180 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002181 if (!id->object)
2182 return NULL;
2183 PyUnicode_InternInPlace(&id->object);
2184 assert(!id->next);
2185 id->next = static_strings;
2186 static_strings = id;
2187 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002188 return id->object;
2189}
2190
2191void
2192_PyUnicode_ClearStaticStrings()
2193{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002194 _Py_Identifier *tmp, *s = static_strings;
2195 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002196 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002197 tmp = s->next;
2198 s->next = NULL;
2199 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002200 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002201 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002202}
2203
Benjamin Peterson0df54292012-03-26 14:50:32 -04002204/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002205
Victor Stinnerd3f08822012-05-29 12:57:52 +02002206PyObject*
2207_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002208{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002209 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002210 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002211 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002212#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002213 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002214#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002215 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002216 }
Victor Stinner785938e2011-12-11 20:09:03 +01002217 unicode = PyUnicode_New(size, 127);
2218 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002219 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002220 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2221 assert(_PyUnicode_CheckConsistency(unicode, 1));
2222 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002223}
2224
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002225static Py_UCS4
2226kind_maxchar_limit(unsigned int kind)
2227{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002228 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002229 case PyUnicode_1BYTE_KIND:
2230 return 0x80;
2231 case PyUnicode_2BYTE_KIND:
2232 return 0x100;
2233 case PyUnicode_4BYTE_KIND:
2234 return 0x10000;
2235 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002236 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002237 }
2238}
2239
Victor Stinner702c7342011-10-05 13:50:52 +02002240static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002241_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002242{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002243 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002244 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002245
Serhiy Storchaka678db842013-01-26 12:16:36 +02002246 if (size == 0)
2247 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002248 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002249 if (size == 1)
2250 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002251
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002252 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002253 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002254 if (!res)
2255 return NULL;
2256 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002257 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002258 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002259}
2260
Victor Stinnere57b1c02011-09-28 22:20:48 +02002261static PyObject*
2262_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002263{
2264 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002265 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002266
Serhiy Storchaka678db842013-01-26 12:16:36 +02002267 if (size == 0)
2268 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002269 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002270 if (size == 1)
2271 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002272
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002273 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002274 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002275 if (!res)
2276 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002277 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002278 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002279 else {
2280 _PyUnicode_CONVERT_BYTES(
2281 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2282 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002283 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002284 return res;
2285}
2286
Victor Stinnere57b1c02011-09-28 22:20:48 +02002287static PyObject*
2288_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002289{
2290 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002291 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002292
Serhiy Storchaka678db842013-01-26 12:16:36 +02002293 if (size == 0)
2294 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002295 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002296 if (size == 1)
2297 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002298
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002299 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002300 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002301 if (!res)
2302 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002303 if (max_char < 256)
2304 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2305 PyUnicode_1BYTE_DATA(res));
2306 else if (max_char < 0x10000)
2307 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2308 PyUnicode_2BYTE_DATA(res));
2309 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002310 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002311 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002312 return res;
2313}
2314
2315PyObject*
2316PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2317{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002318 if (size < 0) {
2319 PyErr_SetString(PyExc_ValueError, "size must be positive");
2320 return NULL;
2321 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002322 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002323 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002324 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002325 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002326 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002327 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002328 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002329 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002330 PyErr_SetString(PyExc_SystemError, "invalid kind");
2331 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002332 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002333}
2334
Victor Stinnerece58de2012-04-23 23:36:38 +02002335Py_UCS4
2336_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2337{
2338 enum PyUnicode_Kind kind;
2339 void *startptr, *endptr;
2340
2341 assert(PyUnicode_IS_READY(unicode));
2342 assert(0 <= start);
2343 assert(end <= PyUnicode_GET_LENGTH(unicode));
2344 assert(start <= end);
2345
2346 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2347 return PyUnicode_MAX_CHAR_VALUE(unicode);
2348
2349 if (start == end)
2350 return 127;
2351
Victor Stinner94d558b2012-04-27 22:26:58 +02002352 if (PyUnicode_IS_ASCII(unicode))
2353 return 127;
2354
Victor Stinnerece58de2012-04-23 23:36:38 +02002355 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002356 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002357 endptr = (char *)startptr + end * kind;
2358 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002359 switch(kind) {
2360 case PyUnicode_1BYTE_KIND:
2361 return ucs1lib_find_max_char(startptr, endptr);
2362 case PyUnicode_2BYTE_KIND:
2363 return ucs2lib_find_max_char(startptr, endptr);
2364 case PyUnicode_4BYTE_KIND:
2365 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002366 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002367 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002368 }
2369}
2370
Victor Stinner25a4b292011-10-06 12:31:55 +02002371/* Ensure that a string uses the most efficient storage, if it is not the
2372 case: create a new string with of the right kind. Write NULL into *p_unicode
2373 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002374static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002375unicode_adjust_maxchar(PyObject **p_unicode)
2376{
2377 PyObject *unicode, *copy;
2378 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002379 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002380 unsigned int kind;
2381
2382 assert(p_unicode != NULL);
2383 unicode = *p_unicode;
2384 assert(PyUnicode_IS_READY(unicode));
2385 if (PyUnicode_IS_ASCII(unicode))
2386 return;
2387
2388 len = PyUnicode_GET_LENGTH(unicode);
2389 kind = PyUnicode_KIND(unicode);
2390 if (kind == PyUnicode_1BYTE_KIND) {
2391 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002392 max_char = ucs1lib_find_max_char(u, u + len);
2393 if (max_char >= 128)
2394 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002395 }
2396 else if (kind == PyUnicode_2BYTE_KIND) {
2397 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002398 max_char = ucs2lib_find_max_char(u, u + len);
2399 if (max_char >= 256)
2400 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002401 }
2402 else {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002403 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Victor Stinner25a4b292011-10-06 12:31:55 +02002404 assert(kind == PyUnicode_4BYTE_KIND);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002405 max_char = ucs4lib_find_max_char(u, u + len);
2406 if (max_char >= 0x10000)
2407 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002408 }
Victor Stinner25a4b292011-10-06 12:31:55 +02002409 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002410 if (copy != NULL)
2411 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002412 Py_DECREF(unicode);
2413 *p_unicode = copy;
2414}
2415
Victor Stinner034f6cf2011-09-30 02:26:44 +02002416PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002417_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002418{
Victor Stinner87af4f22011-11-21 23:03:47 +01002419 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002420 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002421
Victor Stinner034f6cf2011-09-30 02:26:44 +02002422 if (!PyUnicode_Check(unicode)) {
2423 PyErr_BadInternalCall();
2424 return NULL;
2425 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002426 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002427 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002428
Victor Stinner87af4f22011-11-21 23:03:47 +01002429 length = PyUnicode_GET_LENGTH(unicode);
2430 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002431 if (!copy)
2432 return NULL;
2433 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2434
Christian Heimesf051e432016-09-13 20:22:02 +02002435 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002436 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002437 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002438 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002439}
2440
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002441
Victor Stinnerbc603d12011-10-02 01:00:40 +02002442/* Widen Unicode objects to larger buffers. Don't write terminating null
2443 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002444
2445void*
2446_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2447{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002448 Py_ssize_t len;
2449 void *result;
2450 unsigned int skind;
2451
Benjamin Petersonbac79492012-01-14 13:34:47 -05002452 if (PyUnicode_READY(s) == -1)
Victor Stinnerbc603d12011-10-02 01:00:40 +02002453 return NULL;
2454
2455 len = PyUnicode_GET_LENGTH(s);
2456 skind = PyUnicode_KIND(s);
2457 if (skind >= kind) {
Victor Stinner01698042011-10-04 00:04:26 +02002458 PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002459 return NULL;
2460 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002461 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002462 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002463 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002464 if (!result)
2465 return PyErr_NoMemory();
2466 assert(skind == PyUnicode_1BYTE_KIND);
2467 _PyUnicode_CONVERT_BYTES(
2468 Py_UCS1, Py_UCS2,
2469 PyUnicode_1BYTE_DATA(s),
2470 PyUnicode_1BYTE_DATA(s) + len,
2471 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002472 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002473 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002474 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002475 if (!result)
2476 return PyErr_NoMemory();
2477 if (skind == PyUnicode_2BYTE_KIND) {
2478 _PyUnicode_CONVERT_BYTES(
2479 Py_UCS2, Py_UCS4,
2480 PyUnicode_2BYTE_DATA(s),
2481 PyUnicode_2BYTE_DATA(s) + len,
2482 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002483 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002484 else {
2485 assert(skind == PyUnicode_1BYTE_KIND);
2486 _PyUnicode_CONVERT_BYTES(
2487 Py_UCS1, Py_UCS4,
2488 PyUnicode_1BYTE_DATA(s),
2489 PyUnicode_1BYTE_DATA(s) + len,
2490 result);
2491 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002492 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002493 default:
2494 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002495 }
Victor Stinner01698042011-10-04 00:04:26 +02002496 PyErr_SetString(PyExc_SystemError, "invalid kind");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002497 return NULL;
2498}
2499
2500static Py_UCS4*
2501as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2502 int copy_null)
2503{
2504 int kind;
2505 void *data;
2506 Py_ssize_t len, targetlen;
2507 if (PyUnicode_READY(string) == -1)
2508 return NULL;
2509 kind = PyUnicode_KIND(string);
2510 data = PyUnicode_DATA(string);
2511 len = PyUnicode_GET_LENGTH(string);
2512 targetlen = len;
2513 if (copy_null)
2514 targetlen++;
2515 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002516 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002517 if (!target) {
2518 PyErr_NoMemory();
2519 return NULL;
2520 }
2521 }
2522 else {
2523 if (targetsize < targetlen) {
2524 PyErr_Format(PyExc_SystemError,
2525 "string is longer than the buffer");
2526 if (copy_null && 0 < targetsize)
2527 target[0] = 0;
2528 return NULL;
2529 }
2530 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002531 if (kind == PyUnicode_1BYTE_KIND) {
2532 Py_UCS1 *start = (Py_UCS1 *) data;
2533 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002534 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002535 else if (kind == PyUnicode_2BYTE_KIND) {
2536 Py_UCS2 *start = (Py_UCS2 *) data;
2537 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2538 }
2539 else {
2540 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002541 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002542 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002543 if (copy_null)
2544 target[len] = 0;
2545 return target;
2546}
2547
2548Py_UCS4*
2549PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2550 int copy_null)
2551{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002552 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002553 PyErr_BadInternalCall();
2554 return NULL;
2555 }
2556 return as_ucs4(string, target, targetsize, copy_null);
2557}
2558
2559Py_UCS4*
2560PyUnicode_AsUCS4Copy(PyObject *string)
2561{
2562 return as_ucs4(string, NULL, 0, 1);
2563}
2564
/* Maximum number of characters required for the output of %lld or %p.
   A long long needs at most ceil(log10(256) * SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is used as an upper bound for log10(256);
   the ceiling of the division is written as (x - 1) / 22 + 1, which
   together with the sign gives the leading 2. */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53 - 1) / 22))
Victor Stinner96865452011-03-01 23:44:09 +00002569
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002570static int
2571unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2572 Py_ssize_t width, Py_ssize_t precision)
2573{
2574 Py_ssize_t length, fill, arglen;
2575 Py_UCS4 maxchar;
2576
2577 if (PyUnicode_READY(str) == -1)
2578 return -1;
2579
2580 length = PyUnicode_GET_LENGTH(str);
2581 if ((precision == -1 || precision >= length)
2582 && width <= length)
2583 return _PyUnicodeWriter_WriteStr(writer, str);
2584
2585 if (precision != -1)
2586 length = Py_MIN(precision, length);
2587
2588 arglen = Py_MAX(length, width);
2589 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2590 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2591 else
2592 maxchar = writer->maxchar;
2593
2594 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2595 return -1;
2596
2597 if (width > length) {
2598 fill = width - length;
2599 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2600 return -1;
2601 writer->pos += fill;
2602 }
2603
2604 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2605 str, 0, length);
2606 writer->pos += length;
2607 return 0;
2608}
2609
2610static int
Victor Stinner998b8062018-09-12 00:23:25 +02002611unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002612 Py_ssize_t width, Py_ssize_t precision)
2613{
2614 /* UTF-8 */
2615 Py_ssize_t length;
2616 PyObject *unicode;
2617 int res;
2618
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002619 if (precision == -1) {
2620 length = strlen(str);
2621 }
2622 else {
2623 length = 0;
2624 while (length < precision && str[length]) {
2625 length++;
2626 }
2627 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002628 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2629 if (unicode == NULL)
2630 return -1;
2631
2632 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2633 Py_DECREF(unicode);
2634 return res;
2635}
2636
Victor Stinner96865452011-03-01 23:44:09 +00002637static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002638unicode_fromformat_arg(_PyUnicodeWriter *writer,
2639 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002640{
Victor Stinnere215d962012-10-06 23:03:36 +02002641 const char *p;
2642 Py_ssize_t len;
2643 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002644 Py_ssize_t width;
2645 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002646 int longflag;
2647 int longlongflag;
2648 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002649 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002650
2651 p = f;
2652 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002653 zeropad = 0;
2654 if (*f == '0') {
2655 zeropad = 1;
2656 f++;
2657 }
Victor Stinner96865452011-03-01 23:44:09 +00002658
2659 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002660 width = -1;
2661 if (Py_ISDIGIT((unsigned)*f)) {
2662 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002663 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002664 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002665 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002666 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002667 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002668 return NULL;
2669 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002670 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002671 f++;
2672 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002673 }
2674 precision = -1;
2675 if (*f == '.') {
2676 f++;
2677 if (Py_ISDIGIT((unsigned)*f)) {
2678 precision = (*f - '0');
2679 f++;
2680 while (Py_ISDIGIT((unsigned)*f)) {
2681 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2682 PyErr_SetString(PyExc_ValueError,
2683 "precision too big");
2684 return NULL;
2685 }
2686 precision = (precision * 10) + (*f - '0');
2687 f++;
2688 }
2689 }
Victor Stinner96865452011-03-01 23:44:09 +00002690 if (*f == '%') {
2691 /* "%.3%s" => f points to "3" */
2692 f--;
2693 }
2694 }
2695 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002696 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002697 f--;
2698 }
Victor Stinner96865452011-03-01 23:44:09 +00002699
2700 /* Handle %ld, %lu, %lld and %llu. */
2701 longflag = 0;
2702 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002703 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002704 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002705 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002706 longflag = 1;
2707 ++f;
2708 }
Victor Stinner96865452011-03-01 23:44:09 +00002709 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002710 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002711 longlongflag = 1;
2712 f += 2;
2713 }
Victor Stinner96865452011-03-01 23:44:09 +00002714 }
2715 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002716 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002717 size_tflag = 1;
2718 ++f;
2719 }
Victor Stinnere215d962012-10-06 23:03:36 +02002720
2721 if (f[1] == '\0')
2722 writer->overallocate = 0;
2723
2724 switch (*f) {
2725 case 'c':
2726 {
2727 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002728 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002729 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002730 "character argument not in range(0x110000)");
2731 return NULL;
2732 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002733 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002734 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002735 break;
2736 }
2737
2738 case 'i':
2739 case 'd':
2740 case 'u':
2741 case 'x':
2742 {
2743 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002744 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002745 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002746
2747 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002748 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002749 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002750 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002751 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002752 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002753 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002754 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002755 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002756 va_arg(*vargs, size_t));
2757 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002758 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002759 va_arg(*vargs, unsigned int));
2760 }
2761 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002762 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002763 }
2764 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002765 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002766 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002767 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002768 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002769 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002770 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002771 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002772 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002773 va_arg(*vargs, Py_ssize_t));
2774 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002775 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002776 va_arg(*vargs, int));
2777 }
2778 assert(len >= 0);
2779
Victor Stinnere215d962012-10-06 23:03:36 +02002780 if (precision < len)
2781 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002782
2783 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002784 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2785 return NULL;
2786
Victor Stinnere215d962012-10-06 23:03:36 +02002787 if (width > precision) {
2788 Py_UCS4 fillchar;
2789 fill = width - precision;
2790 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002791 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2792 return NULL;
2793 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002794 }
Victor Stinner15a11362012-10-06 23:48:20 +02002795 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002796 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002797 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2798 return NULL;
2799 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002800 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002801
Victor Stinner4a587072013-11-19 12:54:53 +01002802 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2803 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002804 break;
2805 }
2806
2807 case 'p':
2808 {
2809 char number[MAX_LONG_LONG_CHARS];
2810
2811 len = sprintf(number, "%p", va_arg(*vargs, void*));
2812 assert(len >= 0);
2813
2814 /* %p is ill-defined: ensure leading 0x. */
2815 if (number[1] == 'X')
2816 number[1] = 'x';
2817 else if (number[1] != 'x') {
2818 memmove(number + 2, number,
2819 strlen(number) + 1);
2820 number[0] = '0';
2821 number[1] = 'x';
2822 len += 2;
2823 }
2824
Victor Stinner4a587072013-11-19 12:54:53 +01002825 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002826 return NULL;
2827 break;
2828 }
2829
2830 case 's':
2831 {
2832 /* UTF-8 */
2833 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002834 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002835 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002836 break;
2837 }
2838
2839 case 'U':
2840 {
2841 PyObject *obj = va_arg(*vargs, PyObject *);
2842 assert(obj && _PyUnicode_CHECK(obj));
2843
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002844 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002845 return NULL;
2846 break;
2847 }
2848
2849 case 'V':
2850 {
2851 PyObject *obj = va_arg(*vargs, PyObject *);
2852 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002853 if (obj) {
2854 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002855 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002856 return NULL;
2857 }
2858 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002859 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002860 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002861 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002862 }
2863 break;
2864 }
2865
2866 case 'S':
2867 {
2868 PyObject *obj = va_arg(*vargs, PyObject *);
2869 PyObject *str;
2870 assert(obj);
2871 str = PyObject_Str(obj);
2872 if (!str)
2873 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002874 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002875 Py_DECREF(str);
2876 return NULL;
2877 }
2878 Py_DECREF(str);
2879 break;
2880 }
2881
2882 case 'R':
2883 {
2884 PyObject *obj = va_arg(*vargs, PyObject *);
2885 PyObject *repr;
2886 assert(obj);
2887 repr = PyObject_Repr(obj);
2888 if (!repr)
2889 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002890 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002891 Py_DECREF(repr);
2892 return NULL;
2893 }
2894 Py_DECREF(repr);
2895 break;
2896 }
2897
2898 case 'A':
2899 {
2900 PyObject *obj = va_arg(*vargs, PyObject *);
2901 PyObject *ascii;
2902 assert(obj);
2903 ascii = PyObject_ASCII(obj);
2904 if (!ascii)
2905 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002906 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002907 Py_DECREF(ascii);
2908 return NULL;
2909 }
2910 Py_DECREF(ascii);
2911 break;
2912 }
2913
2914 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002915 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002916 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002917 break;
2918
2919 default:
2920 /* if we stumble upon an unknown formatting code, copy the rest
2921 of the format string to the output string. (we cannot just
2922 skip the code, since there's no way to know what's in the
2923 argument list) */
2924 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002925 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002926 return NULL;
2927 f = p+len;
2928 return f;
2929 }
2930
2931 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002932 return f;
2933}
2934
Walter Dörwaldd2034312007-05-18 16:29:38 +00002935PyObject *
2936PyUnicode_FromFormatV(const char *format, va_list vargs)
2937{
Victor Stinnere215d962012-10-06 23:03:36 +02002938 va_list vargs2;
2939 const char *f;
2940 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002941
Victor Stinner8f674cc2013-04-17 23:02:17 +02002942 _PyUnicodeWriter_Init(&writer);
2943 writer.min_length = strlen(format) + 100;
2944 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02002945
Benjamin Peterson0c212142016-09-20 20:39:33 -07002946 // Copy varags to be able to pass a reference to a subfunction.
2947 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02002948
2949 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00002950 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02002951 f = unicode_fromformat_arg(&writer, f, &vargs2);
2952 if (f == NULL)
2953 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00002954 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002955 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002956 const char *p;
2957 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002958
Victor Stinnere215d962012-10-06 23:03:36 +02002959 p = f;
2960 do
2961 {
2962 if ((unsigned char)*p > 127) {
2963 PyErr_Format(PyExc_ValueError,
2964 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2965 "string, got a non-ASCII byte: 0x%02x",
2966 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02002967 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002968 }
2969 p++;
2970 }
2971 while (*p != '\0' && *p != '%');
2972 len = p - f;
2973
2974 if (*p == '\0')
2975 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01002976
2977 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002978 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02002979
2980 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00002981 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00002982 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02002983 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002984 return _PyUnicodeWriter_Finish(&writer);
2985
2986 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02002987 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02002988 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00002989 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002990}
2991
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992PyObject *
2993PyUnicode_FromFormat(const char *format, ...)
2994{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002995 PyObject* ret;
2996 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002997
2998#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00002999 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003000#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003001 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003002#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003003 ret = PyUnicode_FromFormatV(format, vargs);
3004 va_end(vargs);
3005 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003006}
3007
Serhiy Storchakac46db922018-10-23 22:58:24 +03003008static Py_ssize_t
3009unicode_get_widechar_size(PyObject *unicode)
3010{
3011 Py_ssize_t res;
3012
3013 assert(unicode != NULL);
3014 assert(_PyUnicode_CHECK(unicode));
3015
3016 if (_PyUnicode_WSTR(unicode) != NULL) {
3017 return PyUnicode_WSTR_LENGTH(unicode);
3018 }
3019 assert(PyUnicode_IS_READY(unicode));
3020
3021 res = _PyUnicode_LENGTH(unicode);
3022#if SIZEOF_WCHAR_T == 2
3023 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3024 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3025 const Py_UCS4 *end = s + res;
3026 for (; s < end; ++s) {
3027 if (*s > 0xFFFF) {
3028 ++res;
3029 }
3030 }
3031 }
3032#endif
3033 return res;
3034}
3035
3036static void
3037unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3038{
3039 const wchar_t *wstr;
3040
3041 assert(unicode != NULL);
3042 assert(_PyUnicode_CHECK(unicode));
3043
3044 wstr = _PyUnicode_WSTR(unicode);
3045 if (wstr != NULL) {
3046 memcpy(w, wstr, size * sizeof(wchar_t));
3047 return;
3048 }
3049 assert(PyUnicode_IS_READY(unicode));
3050
3051 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3052 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3053 for (; size--; ++s, ++w) {
3054 *w = *s;
3055 }
3056 }
3057 else {
3058#if SIZEOF_WCHAR_T == 4
3059 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3060 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3061 for (; size--; ++s, ++w) {
3062 *w = *s;
3063 }
3064#else
3065 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3066 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3067 for (; size--; ++s, ++w) {
3068 Py_UCS4 ch = *s;
3069 if (ch > 0xFFFF) {
3070 assert(ch <= MAX_UNICODE);
3071 /* encode surrogate pair in this case */
3072 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3073 if (!size--)
3074 break;
3075 *w = Py_UNICODE_LOW_SURROGATE(ch);
3076 }
3077 else {
3078 *w = ch;
3079 }
3080 }
3081#endif
3082 }
3083}
3084
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003085#ifdef HAVE_WCHAR_H
3086
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003087/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003088
Victor Stinnerd88d9832011-09-06 02:00:05 +02003089 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003090 character) required to convert the unicode object. Ignore size argument.
3091
Victor Stinnerd88d9832011-09-06 02:00:05 +02003092 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003093 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003094 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003095Py_ssize_t
3096PyUnicode_AsWideChar(PyObject *unicode,
3097 wchar_t *w,
3098 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003099{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003100 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003101
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003102 if (unicode == NULL) {
3103 PyErr_BadInternalCall();
3104 return -1;
3105 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003106 if (!PyUnicode_Check(unicode)) {
3107 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003108 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003109 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003110
3111 res = unicode_get_widechar_size(unicode);
3112 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003113 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003114 }
3115
3116 if (size > res) {
3117 size = res + 1;
3118 }
3119 else {
3120 res = size;
3121 }
3122 unicode_copy_as_widechar(unicode, w, size);
3123 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003124}
3125
Victor Stinner137c34c2010-09-29 10:25:54 +00003126wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003127PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003128 Py_ssize_t *size)
3129{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003130 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003131 Py_ssize_t buflen;
3132
3133 if (unicode == NULL) {
3134 PyErr_BadInternalCall();
3135 return NULL;
3136 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003137 if (!PyUnicode_Check(unicode)) {
3138 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003139 return NULL;
3140 }
3141
Serhiy Storchakac46db922018-10-23 22:58:24 +03003142 buflen = unicode_get_widechar_size(unicode);
3143 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003144 if (buffer == NULL) {
3145 PyErr_NoMemory();
3146 return NULL;
3147 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003148 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3149 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003151 }
3152 else if (wcslen(buffer) != (size_t)buflen) {
3153 PyMem_FREE(buffer);
3154 PyErr_SetString(PyExc_ValueError,
3155 "embedded null character");
3156 return NULL;
3157 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003158 return buffer;
3159}
3160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003161#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003162
Alexander Belopolsky40018472011-02-26 01:02:56 +00003163PyObject *
3164PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003165{
Victor Stinner8faf8212011-12-08 22:14:11 +01003166 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003167 PyErr_SetString(PyExc_ValueError,
3168 "chr() arg not in range(0x110000)");
3169 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003170 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003171
Victor Stinner985a82a2014-01-03 12:53:47 +01003172 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003173}
3174
Alexander Belopolsky40018472011-02-26 01:02:56 +00003175PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003176PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003177{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003178 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003179 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003180 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003181 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003182 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003183 Py_INCREF(obj);
3184 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003185 }
3186 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003187 /* For a Unicode subtype that's not a Unicode object,
3188 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003189 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003190 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003191 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003192 "Can't convert '%.100s' object to str implicitly",
3193 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003194 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003195}
3196
Alexander Belopolsky40018472011-02-26 01:02:56 +00003197PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003198PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003199 const char *encoding,
3200 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003201{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003202 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003203 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003204
Guido van Rossumd57fd912000-03-10 22:53:23 +00003205 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003206 PyErr_BadInternalCall();
3207 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003208 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003209
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003210 /* Decoding bytes objects is the most common case and should be fast */
3211 if (PyBytes_Check(obj)) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003212 if (PyBytes_GET_SIZE(obj) == 0)
3213 _Py_RETURN_UNICODE_EMPTY();
3214 v = PyUnicode_Decode(
3215 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3216 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003217 return v;
3218 }
3219
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003220 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003221 PyErr_SetString(PyExc_TypeError,
3222 "decoding str is not supported");
3223 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003224 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003225
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003226 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3227 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3228 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003229 "decoding to str: need a bytes-like object, %.80s found",
3230 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003231 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003232 }
Tim Petersced69f82003-09-16 20:30:58 +00003233
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003234 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003235 PyBuffer_Release(&buffer);
3236 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003237 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003238
Serhiy Storchaka05997252013-01-26 12:14:02 +02003239 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003240 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003241 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003242}
3243
/* Normalize an encoding name into the buffer 'lower' of 'lower_len' bytes:
   similar to encodings.normalize_encoding(), but also convert to lowercase.

   Alphanumeric characters and '.' are kept (lowercased); every run of other
   characters between two kept characters becomes a single '_', and runs at
   the very beginning or end are dropped.

   Return 1 on success, or 0 on error (the normalized name does not fit in
   lower_len-1 characters plus the terminating null). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    assert(encoding != NULL);

    char *out = lower;
    /* Last usable slot; it is reserved for the terminating null. */
    char *const out_end = &lower[lower_len - 1];
    /* Set when punctuation was seen since the last kept character. */
    int pending_sep = 0;
    const char *in;

    for (in = encoding; *in != 0; in++) {
        char c = *in;

        if (!Py_ISALNUM(c) && c != '.') {
            pending_sep = 1;
            continue;
        }

        /* Emit one separator for the skipped run, except at the start. */
        if (pending_sep && out != lower) {
            if (out == out_end) {
                return 0;
            }
            *out++ = '_';
        }
        pending_sep = 0;

        if (out == out_end) {
            return 0;
        }
        *out++ = Py_TOLOWER(c);
    }

    *out = '\0';
    return 1;
}
3292
Alexander Belopolsky40018472011-02-26 01:02:56 +00003293PyObject *
3294PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003295 Py_ssize_t size,
3296 const char *encoding,
3297 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003298{
3299 PyObject *buffer = NULL, *unicode;
3300 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003301 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3302
3303 if (encoding == NULL) {
3304 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3305 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003306
Fred Drakee4315f52000-05-09 19:53:39 +00003307 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003308 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3309 char *lower = buflower;
3310
3311 /* Fast paths */
3312 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3313 lower += 3;
3314 if (*lower == '_') {
3315 /* Match "utf8" and "utf_8" */
3316 lower++;
3317 }
3318
3319 if (lower[0] == '8' && lower[1] == 0) {
3320 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3321 }
3322 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3323 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3324 }
3325 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3326 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3327 }
3328 }
3329 else {
3330 if (strcmp(lower, "ascii") == 0
3331 || strcmp(lower, "us_ascii") == 0) {
3332 return PyUnicode_DecodeASCII(s, size, errors);
3333 }
Steve Dowercc16be82016-09-08 10:35:16 -07003334 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003335 else if (strcmp(lower, "mbcs") == 0) {
3336 return PyUnicode_DecodeMBCS(s, size, errors);
3337 }
3338 #endif
3339 else if (strcmp(lower, "latin1") == 0
3340 || strcmp(lower, "latin_1") == 0
3341 || strcmp(lower, "iso_8859_1") == 0
3342 || strcmp(lower, "iso8859_1") == 0) {
3343 return PyUnicode_DecodeLatin1(s, size, errors);
3344 }
3345 }
Victor Stinner37296e82010-06-10 13:36:23 +00003346 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003347
3348 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003349 buffer = NULL;
Antoine Pitrouc3b39242009-01-03 16:59:18 +00003350 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003351 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003352 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003353 if (buffer == NULL)
3354 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003355 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003356 if (unicode == NULL)
3357 goto onError;
3358 if (!PyUnicode_Check(unicode)) {
3359 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003360 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003361 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003362 encoding,
3363 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003364 Py_DECREF(unicode);
3365 goto onError;
3366 }
3367 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003368 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003369
Benjamin Peterson29060642009-01-31 22:14:21 +00003370 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003371 Py_XDECREF(buffer);
3372 return NULL;
3373}
3374
Alexander Belopolsky40018472011-02-26 01:02:56 +00003375PyObject *
3376PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003377 const char *encoding,
3378 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003379{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003380 if (!PyUnicode_Check(unicode)) {
3381 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003382 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003383 }
3384
Serhiy Storchaka00939072016-10-27 21:05:49 +03003385 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3386 "PyUnicode_AsDecodedObject() is deprecated; "
3387 "use PyCodec_Decode() to decode from str", 1) < 0)
3388 return NULL;
3389
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003390 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003391 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003392
3393 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003394 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003395}
3396
Alexander Belopolsky40018472011-02-26 01:02:56 +00003397PyObject *
3398PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003399 const char *encoding,
3400 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003401{
3402 PyObject *v;
3403
3404 if (!PyUnicode_Check(unicode)) {
3405 PyErr_BadArgument();
3406 goto onError;
3407 }
3408
Serhiy Storchaka00939072016-10-27 21:05:49 +03003409 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3410 "PyUnicode_AsDecodedUnicode() is deprecated; "
3411 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3412 return NULL;
3413
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003414 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003415 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003416
3417 /* Decode via the codec registry */
3418 v = PyCodec_Decode(unicode, encoding, errors);
3419 if (v == NULL)
3420 goto onError;
3421 if (!PyUnicode_Check(v)) {
3422 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003423 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003424 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003425 encoding,
3426 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003427 Py_DECREF(v);
3428 goto onError;
3429 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003430 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003431
Benjamin Peterson29060642009-01-31 22:14:21 +00003432 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003433 return NULL;
3434}
3435
Alexander Belopolsky40018472011-02-26 01:02:56 +00003436PyObject *
3437PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003438 Py_ssize_t size,
3439 const char *encoding,
3440 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003441{
3442 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003443
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003444 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003445 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003446 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003447 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3448 Py_DECREF(unicode);
3449 return v;
3450}
3451
Alexander Belopolsky40018472011-02-26 01:02:56 +00003452PyObject *
3453PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003454 const char *encoding,
3455 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003456{
3457 PyObject *v;
3458
3459 if (!PyUnicode_Check(unicode)) {
3460 PyErr_BadArgument();
3461 goto onError;
3462 }
3463
Serhiy Storchaka00939072016-10-27 21:05:49 +03003464 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3465 "PyUnicode_AsEncodedObject() is deprecated; "
3466 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3467 "or PyCodec_Encode() for generic encoding", 1) < 0)
3468 return NULL;
3469
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003470 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003471 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003472
3473 /* Encode via the codec registry */
3474 v = PyCodec_Encode(unicode, encoding, errors);
3475 if (v == NULL)
3476 goto onError;
3477 return v;
3478
Benjamin Peterson29060642009-01-31 22:14:21 +00003479 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003480 return NULL;
3481}
3482
Victor Stinner1b579672011-12-17 05:47:23 +01003483
Victor Stinner2cba6b82018-01-10 22:46:15 +01003484static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003485unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003486 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003487{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003488 Py_ssize_t wlen;
3489 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3490 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003491 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003492 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003493
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003494 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003495 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003496 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003497 return NULL;
3498 }
3499
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003500 char *str;
3501 size_t error_pos;
3502 const char *reason;
3503 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003504 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003505 PyMem_Free(wstr);
3506
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003507 if (res != 0) {
3508 if (res == -2) {
3509 PyObject *exc;
3510 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3511 "locale", unicode,
3512 (Py_ssize_t)error_pos,
3513 (Py_ssize_t)(error_pos+1),
3514 reason);
3515 if (exc != NULL) {
3516 PyCodec_StrictErrors(exc);
3517 Py_DECREF(exc);
3518 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003519 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003520 else if (res == -3) {
3521 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3522 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003523 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003524 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003525 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003526 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003527 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003528
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003529 PyObject *bytes = PyBytes_FromString(str);
3530 PyMem_RawFree(str);
3531 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003532}
3533
Victor Stinnerad158722010-10-27 00:25:46 +00003534PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003535PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3536{
Victor Stinner709d23d2019-05-02 14:56:30 -04003537 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3538 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003539}
3540
3541PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003542PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003543{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003544 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003545#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003546 if (interp->fs_codec.encoding) {
3547 return unicode_encode_utf8(unicode,
3548 interp->fs_codec.error_handler,
3549 interp->fs_codec.errors);
3550 }
3551 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003552 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003553 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003554 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003555 assert(errors != _Py_ERROR_UNKNOWN);
3556 return unicode_encode_utf8(unicode, errors, NULL);
3557 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003558#else
Victor Stinner793b5312011-04-27 00:24:21 +02003559 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3560 cannot use it to encode and decode filenames before it is loaded. Load
3561 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003562 implementation of the locale codec until the codec registry is
3563 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003564 if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003565 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003566 interp->fs_codec.encoding,
3567 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003568 }
3569 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003570 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003571 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003572 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003573 assert(errors != _Py_ERROR_UNKNOWN);
3574 return unicode_encode_locale(unicode, errors, 0);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003575 }
Victor Stinnerad158722010-10-27 00:25:46 +00003576#endif
Victor Stinnerae6265f2010-05-15 16:27:27 +00003577}
3578
Alexander Belopolsky40018472011-02-26 01:02:56 +00003579PyObject *
3580PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003581 const char *encoding,
3582 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003583{
3584 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003585 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003586
Guido van Rossumd57fd912000-03-10 22:53:23 +00003587 if (!PyUnicode_Check(unicode)) {
3588 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003589 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003590 }
Fred Drakee4315f52000-05-09 19:53:39 +00003591
Victor Stinner942889a2016-09-05 15:40:10 -07003592 if (encoding == NULL) {
3593 return _PyUnicode_AsUTF8String(unicode, errors);
3594 }
3595
Fred Drakee4315f52000-05-09 19:53:39 +00003596 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003597 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3598 char *lower = buflower;
3599
3600 /* Fast paths */
3601 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3602 lower += 3;
3603 if (*lower == '_') {
3604 /* Match "utf8" and "utf_8" */
3605 lower++;
3606 }
3607
3608 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003609 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003610 }
3611 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3612 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3613 }
3614 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3615 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3616 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003617 }
Victor Stinner942889a2016-09-05 15:40:10 -07003618 else {
3619 if (strcmp(lower, "ascii") == 0
3620 || strcmp(lower, "us_ascii") == 0) {
3621 return _PyUnicode_AsASCIIString(unicode, errors);
3622 }
Steve Dowercc16be82016-09-08 10:35:16 -07003623#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003624 else if (strcmp(lower, "mbcs") == 0) {
3625 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3626 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003627#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003628 else if (strcmp(lower, "latin1") == 0 ||
3629 strcmp(lower, "latin_1") == 0 ||
3630 strcmp(lower, "iso_8859_1") == 0 ||
3631 strcmp(lower, "iso8859_1") == 0) {
3632 return _PyUnicode_AsLatin1String(unicode, errors);
3633 }
3634 }
Victor Stinner37296e82010-06-10 13:36:23 +00003635 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003636
3637 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003638 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003639 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003640 return NULL;
3641
3642 /* The normal path */
3643 if (PyBytes_Check(v))
3644 return v;
3645
3646 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003647 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003648 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003649 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003650
3651 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003652 "encoder %s returned bytearray instead of bytes; "
3653 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003654 encoding);
3655 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003656 Py_DECREF(v);
3657 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003658 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003659
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003660 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3661 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003662 Py_DECREF(v);
3663 return b;
3664 }
3665
3666 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003667 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003668 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003669 encoding,
3670 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003671 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003672 return NULL;
3673}
3674
Alexander Belopolsky40018472011-02-26 01:02:56 +00003675PyObject *
3676PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003677 const char *encoding,
3678 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003679{
3680 PyObject *v;
3681
3682 if (!PyUnicode_Check(unicode)) {
3683 PyErr_BadArgument();
3684 goto onError;
3685 }
3686
Serhiy Storchaka00939072016-10-27 21:05:49 +03003687 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3688 "PyUnicode_AsEncodedUnicode() is deprecated; "
3689 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3690 return NULL;
3691
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003692 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003693 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003694
3695 /* Encode via the codec registry */
3696 v = PyCodec_Encode(unicode, encoding, errors);
3697 if (v == NULL)
3698 goto onError;
3699 if (!PyUnicode_Check(v)) {
3700 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003701 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003702 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003703 encoding,
3704 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003705 Py_DECREF(v);
3706 goto onError;
3707 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003709
Benjamin Peterson29060642009-01-31 22:14:21 +00003710 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003711 return NULL;
3712}
3713
Victor Stinner2cba6b82018-01-10 22:46:15 +01003714static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003715unicode_decode_locale(const char *str, Py_ssize_t len,
3716 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003717{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003718 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3719 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003720 return NULL;
3721 }
3722
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003723 wchar_t *wstr;
3724 size_t wlen;
3725 const char *reason;
3726 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003727 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003728 if (res != 0) {
3729 if (res == -2) {
3730 PyObject *exc;
3731 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3732 "locale", str, len,
3733 (Py_ssize_t)wlen,
3734 (Py_ssize_t)(wlen + 1),
3735 reason);
3736 if (exc != NULL) {
3737 PyCodec_StrictErrors(exc);
3738 Py_DECREF(exc);
3739 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003740 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003741 else if (res == -3) {
3742 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3743 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003744 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003745 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003746 }
Victor Stinner2f197072011-12-17 07:08:30 +01003747 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003748 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003749
3750 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3751 PyMem_RawFree(wstr);
3752 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003753}
3754
3755PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003756PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3757 const char *errors)
3758{
Victor Stinner709d23d2019-05-02 14:56:30 -04003759 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3760 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003761}
3762
3763PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003764PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003765{
3766 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003767 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3768 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003769}
3770
3771
3772PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003773PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003774 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003775 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3776}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003777
Christian Heimes5894ba72007-11-04 11:43:14 +00003778PyObject*
3779PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3780{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003781 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnere2510952019-05-02 11:28:57 -04003782#ifdef _Py_FORCE_UTF8_FS_ENCODING
Victor Stinner709d23d2019-05-02 14:56:30 -04003783 if (interp->fs_codec.encoding) {
3784 return unicode_decode_utf8(s, size,
3785 interp->fs_codec.error_handler,
3786 interp->fs_codec.errors,
3787 NULL);
3788 }
3789 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003790 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003791 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003792 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003793 assert(errors != _Py_ERROR_UNKNOWN);
3794 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3795 }
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003796#else
Victor Stinner793b5312011-04-27 00:24:21 +02003797 /* Bootstrap check: if the filesystem codec is implemented in Python, we
3798 cannot use it to encode and decode filenames before it is loaded. Load
3799 the Python codec requires to encode at least its own filename. Use the C
Victor Stinnerb2457ef2018-08-29 13:25:36 +02003800 implementation of the locale codec until the codec registry is
3801 initialized and the Python codec is loaded. See initfsencoding(). */
Victor Stinner709d23d2019-05-02 14:56:30 -04003802 if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003803 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003804 interp->fs_codec.encoding,
3805 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003806 }
3807 else {
Victor Stinner331a6a52019-05-27 16:39:22 +02003808 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
Victor Stinner709d23d2019-05-02 14:56:30 -04003809 _Py_error_handler errors;
Victor Stinner331a6a52019-05-27 16:39:22 +02003810 errors = get_error_handler_wide(filesystem_errors);
Victor Stinner709d23d2019-05-02 14:56:30 -04003811 return unicode_decode_locale(s, size, errors, 0);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003812 }
Victor Stinnerad158722010-10-27 00:25:46 +00003813#endif
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003814}
3815
Martin v. Löwis011e8422009-05-05 04:43:17 +00003816
3817int
3818PyUnicode_FSConverter(PyObject* arg, void* addr)
3819{
Brett Cannonec6ce872016-09-06 15:50:29 -07003820 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003821 PyObject *output = NULL;
3822 Py_ssize_t size;
3823 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003824 if (arg == NULL) {
3825 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003826 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003827 return 1;
3828 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003829 path = PyOS_FSPath(arg);
3830 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003831 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003832 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003833 if (PyBytes_Check(path)) {
3834 output = path;
3835 }
3836 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3837 output = PyUnicode_EncodeFSDefault(path);
3838 Py_DECREF(path);
3839 if (!output) {
3840 return 0;
3841 }
3842 assert(PyBytes_Check(output));
3843 }
3844
Victor Stinner0ea2a462010-04-30 00:22:08 +00003845 size = PyBytes_GET_SIZE(output);
3846 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003847 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003848 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003849 Py_DECREF(output);
3850 return 0;
3851 }
3852 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003853 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003854}
3855
3856
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003857int
3858PyUnicode_FSDecoder(PyObject* arg, void* addr)
3859{
Brett Cannona5711202016-09-06 19:36:01 -07003860 int is_buffer = 0;
3861 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003862 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003863 if (arg == NULL) {
3864 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003865 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003866 return 1;
3867 }
Brett Cannona5711202016-09-06 19:36:01 -07003868
3869 is_buffer = PyObject_CheckBuffer(arg);
3870 if (!is_buffer) {
3871 path = PyOS_FSPath(arg);
3872 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003873 return 0;
3874 }
Brett Cannona5711202016-09-06 19:36:01 -07003875 }
3876 else {
3877 path = arg;
3878 Py_INCREF(arg);
3879 }
3880
3881 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003882 output = path;
3883 }
3884 else if (PyBytes_Check(path) || is_buffer) {
3885 PyObject *path_bytes = NULL;
3886
3887 if (!PyBytes_Check(path) &&
3888 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003889 "path should be string, bytes, or os.PathLike, not %.200s",
3890 Py_TYPE(arg)->tp_name)) {
3891 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003892 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003893 }
3894 path_bytes = PyBytes_FromObject(path);
3895 Py_DECREF(path);
3896 if (!path_bytes) {
3897 return 0;
3898 }
3899 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3900 PyBytes_GET_SIZE(path_bytes));
3901 Py_DECREF(path_bytes);
3902 if (!output) {
3903 return 0;
3904 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003905 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003906 else {
3907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003908 "path should be string, bytes, or os.PathLike, not %.200s",
3909 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003910 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003911 return 0;
3912 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003913 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003914 Py_DECREF(output);
3915 return 0;
3916 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003917 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003918 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003919 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003920 Py_DECREF(output);
3921 return 0;
3922 }
3923 *(PyObject**)addr = output;
3924 return Py_CLEANUP_SUPPORTED;
3925}
3926
3927
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003928const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003929PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003930{
Christian Heimesf3863112007-11-22 07:46:41 +00003931 PyObject *bytes;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003932
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003933 if (!PyUnicode_Check(unicode)) {
3934 PyErr_BadArgument();
3935 return NULL;
3936 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003937 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003938 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003939
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003940 if (PyUnicode_UTF8(unicode) == NULL) {
3941 assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003942 bytes = _PyUnicode_AsUTF8String(unicode, NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003943 if (bytes == NULL)
3944 return NULL;
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003945 _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3946 if (_PyUnicode_UTF8(unicode) == NULL) {
Victor Stinnera5afb582013-10-29 01:28:23 +01003947 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003948 Py_DECREF(bytes);
3949 return NULL;
3950 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003951 _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
Christian Heimesf051e432016-09-13 20:22:02 +02003952 memcpy(_PyUnicode_UTF8(unicode),
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003953 PyBytes_AS_STRING(bytes),
3954 _PyUnicode_UTF8_LENGTH(unicode) + 1);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003955 Py_DECREF(bytes);
3956 }
3957
3958 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02003959 *psize = PyUnicode_UTF8_LENGTH(unicode);
3960 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003961}
3962
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003963const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003964PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00003965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003966 return PyUnicode_AsUTF8AndSize(unicode, NULL);
3967}
3968
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003969Py_UNICODE *
3970PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3971{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003972 if (!PyUnicode_Check(unicode)) {
3973 PyErr_BadArgument();
3974 return NULL;
3975 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003976 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
3977 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003978 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03003979 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003980 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981
Serhiy Storchakac46db922018-10-23 22:58:24 +03003982 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
3983 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3984 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003985 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003986 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003987 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
3988 if (w == NULL) {
3989 PyErr_NoMemory();
3990 return NULL;
3991 }
3992 unicode_copy_as_widechar(unicode, w, wlen + 1);
3993 _PyUnicode_WSTR(unicode) = w;
3994 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
3995 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003996 }
3997 }
3998 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02003999 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004000 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004001}
4002
Alexander Belopolsky40018472011-02-26 01:02:56 +00004003Py_UNICODE *
4004PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004005{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004007}
4008
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004009const Py_UNICODE *
4010_PyUnicode_AsUnicode(PyObject *unicode)
4011{
4012 Py_ssize_t size;
4013 const Py_UNICODE *wstr;
4014
4015 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4016 if (wstr && wcslen(wstr) != (size_t)size) {
4017 PyErr_SetString(PyExc_ValueError, "embedded null character");
4018 return NULL;
4019 }
4020 return wstr;
4021}
4022
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004023
Alexander Belopolsky40018472011-02-26 01:02:56 +00004024Py_ssize_t
4025PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004026{
4027 if (!PyUnicode_Check(unicode)) {
4028 PyErr_BadArgument();
4029 goto onError;
4030 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004031 if (_PyUnicode_WSTR(unicode) == NULL) {
4032 if (PyUnicode_AsUnicode(unicode) == NULL)
4033 goto onError;
4034 }
4035 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004036
Benjamin Peterson29060642009-01-31 22:14:21 +00004037 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004038 return -1;
4039}
4040
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004041Py_ssize_t
4042PyUnicode_GetLength(PyObject *unicode)
4043{
Victor Stinner07621332012-06-16 04:53:46 +02004044 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004045 PyErr_BadArgument();
4046 return -1;
4047 }
Victor Stinner07621332012-06-16 04:53:46 +02004048 if (PyUnicode_READY(unicode) == -1)
4049 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004050 return PyUnicode_GET_LENGTH(unicode);
4051}
4052
4053Py_UCS4
4054PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4055{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004056 void *data;
4057 int kind;
4058
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004059 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004060 PyErr_BadArgument();
4061 return (Py_UCS4)-1;
4062 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004063 if (PyUnicode_READY(unicode) == -1) {
4064 return (Py_UCS4)-1;
4065 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004066 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004067 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004068 return (Py_UCS4)-1;
4069 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004070 data = PyUnicode_DATA(unicode);
4071 kind = PyUnicode_KIND(unicode);
4072 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004073}
4074
4075int
4076PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4077{
4078 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004079 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004080 return -1;
4081 }
Victor Stinner488fa492011-12-12 00:01:39 +01004082 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004083 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004084 PyErr_SetString(PyExc_IndexError, "string index out of range");
4085 return -1;
4086 }
Victor Stinner488fa492011-12-12 00:01:39 +01004087 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004088 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004089 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4090 PyErr_SetString(PyExc_ValueError, "character out of range");
4091 return -1;
4092 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4094 index, ch);
4095 return 0;
4096}
4097
/* Return the name of the default encoding, which is always "utf-8".
   The returned string is statically allocated and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";
    return default_encoding;
}
4103
Victor Stinner554f3f02010-06-16 23:33:54 +00004104/* create or adjust a UnicodeDecodeError */
4105static void
4106make_decode_exception(PyObject **exceptionObject,
4107 const char *encoding,
4108 const char *input, Py_ssize_t length,
4109 Py_ssize_t startpos, Py_ssize_t endpos,
4110 const char *reason)
4111{
4112 if (*exceptionObject == NULL) {
4113 *exceptionObject = PyUnicodeDecodeError_Create(
4114 encoding, input, length, startpos, endpos, reason);
4115 }
4116 else {
4117 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4118 goto onError;
4119 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4120 goto onError;
4121 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4122 goto onError;
4123 }
4124 return;
4125
4126onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004127 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004128}
4129
Steve Dowercc16be82016-09-08 10:35:16 -07004130#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004131static int
4132widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4133{
4134 if (newsize > *size) {
4135 wchar_t *newbuf = *buf;
4136 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4137 PyErr_NoMemory();
4138 return -1;
4139 }
4140 *buf = newbuf;
4141 }
4142 *size = newsize;
4143 return 0;
4144}
4145
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004146/* error handling callback helper:
4147 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004148 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004149 and adjust various state variables.
4150 return 0 on success, -1 on error
4151*/
4152
Alexander Belopolsky40018472011-02-26 01:02:56 +00004153static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004154unicode_decode_call_errorhandler_wchar(
4155 const char *errors, PyObject **errorHandler,
4156 const char *encoding, const char *reason,
4157 const char **input, const char **inend, Py_ssize_t *startinpos,
4158 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004159 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004160{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004161 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004162
4163 PyObject *restuple = NULL;
4164 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004165 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004166 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004167 Py_ssize_t requiredsize;
4168 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004169 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004170 wchar_t *repwstr;
4171 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004172
4173 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004174 *errorHandler = PyCodec_LookupError(errors);
4175 if (*errorHandler == NULL)
4176 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004177 }
4178
Victor Stinner554f3f02010-06-16 23:33:54 +00004179 make_decode_exception(exceptionObject,
4180 encoding,
4181 *input, *inend - *input,
4182 *startinpos, *endinpos,
4183 reason);
4184 if (*exceptionObject == NULL)
4185 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004186
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004187 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004188 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004189 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004190 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004191 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004192 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004193 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004194 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004195 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004196
4197 /* Copy back the bytes variables, which might have been modified by the
4198 callback */
4199 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4200 if (!inputobj)
4201 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004202 *input = PyBytes_AS_STRING(inputobj);
4203 insize = PyBytes_GET_SIZE(inputobj);
4204 *inend = *input + insize;
4205 /* we can DECREF safely, as the exception has another reference,
4206 so the object won't go away. */
4207 Py_DECREF(inputobj);
4208
4209 if (newpos<0)
4210 newpos = insize+newpos;
4211 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004212 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004213 goto onError;
4214 }
4215
4216 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4217 if (repwstr == NULL)
4218 goto onError;
4219 /* need more space? (at least enough for what we
4220 have+the replacement+the rest of the string (starting
4221 at the new input position), so we won't have to check space
4222 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004223 requiredsize = *outpos;
4224 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4225 goto overflow;
4226 requiredsize += repwlen;
4227 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4228 goto overflow;
4229 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004230 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004231 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004232 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004233 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004234 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004235 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004236 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004237 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004238 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004239 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004240 *endinpos = newpos;
4241 *inptr = *input + newpos;
4242
4243 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004244 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004245 return 0;
4246
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004247 overflow:
4248 PyErr_SetString(PyExc_OverflowError,
4249 "decoded result is too long for a Python string");
4250
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004251 onError:
4252 Py_XDECREF(restuple);
4253 return -1;
4254}
Steve Dowercc16be82016-09-08 10:35:16 -07004255#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004256
4257static int
4258unicode_decode_call_errorhandler_writer(
4259 const char *errors, PyObject **errorHandler,
4260 const char *encoding, const char *reason,
4261 const char **input, const char **inend, Py_ssize_t *startinpos,
4262 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4263 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4264{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004265 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004266
4267 PyObject *restuple = NULL;
4268 PyObject *repunicode = NULL;
4269 Py_ssize_t insize;
4270 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004271 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004272 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004273 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004274 int need_to_grow = 0;
4275 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004276
4277 if (*errorHandler == NULL) {
4278 *errorHandler = PyCodec_LookupError(errors);
4279 if (*errorHandler == NULL)
4280 goto onError;
4281 }
4282
4283 make_decode_exception(exceptionObject,
4284 encoding,
4285 *input, *inend - *input,
4286 *startinpos, *endinpos,
4287 reason);
4288 if (*exceptionObject == NULL)
4289 goto onError;
4290
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01004291 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 if (restuple == NULL)
4293 goto onError;
4294 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004295 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004296 goto onError;
4297 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004298 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004299 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004300
4301 /* Copy back the bytes variables, which might have been modified by the
4302 callback */
4303 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4304 if (!inputobj)
4305 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004306 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004307 *input = PyBytes_AS_STRING(inputobj);
4308 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004309 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004310 /* we can DECREF safely, as the exception has another reference,
4311 so the object won't go away. */
4312 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004313
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004314 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004315 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004316 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004317 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004318 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004319 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004320
Victor Stinner170ca6f2013-04-18 00:25:28 +02004321 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004322 if (replen > 1) {
4323 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004324 need_to_grow = 1;
4325 }
4326 new_inptr = *input + newpos;
4327 if (*inend - new_inptr > remain) {
4328 /* We don't know the decoding algorithm here so we make the worst
4329 assumption that one byte decodes to one unicode character.
4330 If unfortunately one byte could decode to more unicode characters,
4331 the decoder may write out-of-bound then. Is it possible for the
4332 algorithms using this function? */
4333 writer->min_length += *inend - new_inptr - remain;
4334 need_to_grow = 1;
4335 }
4336 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004337 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004338 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004339 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4340 goto onError;
4341 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004342 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004343 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004345 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004346 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004347
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004348 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004349 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004350 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004351
Benjamin Peterson29060642009-01-31 22:14:21 +00004352 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004353 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004354 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004355}
4356
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64.  All of them evaluate their
   argument more than once (TO_BASE64 excepted), so the argument must not
   have side effects. */

/* Is c a base-64 character? */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is range-checked (0 < c < 128) before it is used as a table index.
 * directO and directWS are parenthesized so that any expression can be
 * passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004437
Alexander Belopolsky40018472011-02-26 01:02:56 +00004438PyObject *
4439PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004440 Py_ssize_t size,
4441 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004442{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004443 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4444}
4445
Antoine Pitrou244651a2009-05-04 18:56:13 +00004446/* The decoder. The only state we preserve is our read position,
4447 * i.e. how many characters we have consumed. So if we end in the
4448 * middle of a shift sequence we have to back off the read position
4449 * and the output to the beginning of the sequence, otherwise we lose
4450 * all the shift state (seen bits, number of bits seen, high
4451 * surrogate). */
4452
Alexander Belopolsky40018472011-02-26 01:02:56 +00004453PyObject *
4454PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004455 Py_ssize_t size,
4456 const char *errors,
4457 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004458{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004459 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004460 Py_ssize_t startinpos;
4461 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004462 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004463 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004464 const char *errmsg = "";
4465 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004466 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004467 unsigned int base64bits = 0;
4468 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004469 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004470 PyObject *errorHandler = NULL;
4471 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004472
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004473 if (size == 0) {
4474 if (consumed)
4475 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004476 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004477 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004478
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004479 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004480 _PyUnicodeWriter_Init(&writer);
4481 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004482
4483 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004484 e = s + size;
4485
4486 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004487 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004488 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004489 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004490
Antoine Pitrou244651a2009-05-04 18:56:13 +00004491 if (inShift) { /* in a base-64 section */
4492 if (IS_BASE64(ch)) { /* consume a base-64 character */
4493 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4494 base64bits += 6;
4495 s++;
4496 if (base64bits >= 16) {
4497 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004498 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004499 base64bits -= 16;
4500 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004501 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004502 if (surrogate) {
4503 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004504 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4505 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004506 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004507 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004508 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004509 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004510 }
4511 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004512 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004513 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004514 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004515 }
4516 }
Victor Stinner551ac952011-11-29 22:58:13 +01004517 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004518 /* first surrogate */
4519 surrogate = outCh;
4520 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004521 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004522 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004523 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004524 }
4525 }
4526 }
4527 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004528 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004529 if (base64bits > 0) { /* left-over bits */
4530 if (base64bits >= 6) {
4531 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004532 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004533 errmsg = "partial character in shift sequence";
4534 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004535 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004536 else {
4537 /* Some bits remain; they should be zero */
4538 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004539 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004540 errmsg = "non-zero padding bits in shift sequence";
4541 goto utf7Error;
4542 }
4543 }
4544 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004545 if (surrogate && DECODE_DIRECT(ch)) {
4546 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4547 goto onError;
4548 }
4549 surrogate = 0;
4550 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 /* '-' is absorbed; other terminating
4552 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004553 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004554 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004555 }
4556 }
4557 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004558 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004559 s++; /* consume '+' */
4560 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004561 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004562 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004563 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004564 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004565 else if (s < e && !IS_BASE64(*s)) {
4566 s++;
4567 errmsg = "ill-formed sequence";
4568 goto utf7Error;
4569 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004571 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004572 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004573 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004574 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004575 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004576 }
4577 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004578 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004579 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004580 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004581 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004582 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004583 else {
4584 startinpos = s-starts;
4585 s++;
4586 errmsg = "unexpected special character";
4587 goto utf7Error;
4588 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004589 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004590utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004591 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004592 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004593 errors, &errorHandler,
4594 "utf7", errmsg,
4595 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004596 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004597 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004598 }
4599
Antoine Pitrou244651a2009-05-04 18:56:13 +00004600 /* end of string */
4601
4602 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4603 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004604 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004605 if (surrogate ||
4606 (base64bits >= 6) ||
4607 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004608 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004609 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004610 errors, &errorHandler,
4611 "utf7", "unterminated shift sequence",
4612 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004613 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004614 goto onError;
4615 if (s < e)
4616 goto restart;
4617 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004618 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004619
4620 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004621 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004623 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004624 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004625 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004626 writer.kind, writer.data, shiftOutStart);
4627 Py_XDECREF(errorHandler);
4628 Py_XDECREF(exc);
4629 _PyUnicodeWriter_Dealloc(&writer);
4630 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004631 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004632 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004633 }
4634 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004635 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004636 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004637 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004638
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004639 Py_XDECREF(errorHandler);
4640 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004641 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004642
Benjamin Peterson29060642009-01-31 22:14:21 +00004643 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004644 Py_XDECREF(errorHandler);
4645 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004646 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004647 return NULL;
4648}
4649
4650
Alexander Belopolsky40018472011-02-26 01:02:56 +00004651PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004652_PyUnicode_EncodeUTF7(PyObject *str,
4653 int base64SetO,
4654 int base64WhiteSpace,
4655 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004656{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004657 int kind;
4658 void *data;
4659 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004660 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004661 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004662 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004663 unsigned int base64bits = 0;
4664 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004665 char * out;
4666 char * start;
4667
Benjamin Petersonbac79492012-01-14 13:34:47 -05004668 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004669 return NULL;
4670 kind = PyUnicode_KIND(str);
4671 data = PyUnicode_DATA(str);
4672 len = PyUnicode_GET_LENGTH(str);
4673
4674 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004675 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004676
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004677 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004678 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004679 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004680 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004681 if (v == NULL)
4682 return NULL;
4683
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004684 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004685 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004686 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004687
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 if (inShift) {
4689 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4690 /* shifting out */
4691 if (base64bits) { /* output remaining bits */
4692 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4693 base64buffer = 0;
4694 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004695 }
4696 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004697 /* Characters not in the BASE64 set implicitly unshift the sequence
4698 so no '-' is required, except if the character is itself a '-' */
4699 if (IS_BASE64(ch) || ch == '-') {
4700 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004701 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004702 *out++ = (char) ch;
4703 }
4704 else {
4705 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004706 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004707 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004708 else { /* not in a shift sequence */
4709 if (ch == '+') {
4710 *out++ = '+';
4711 *out++ = '-';
4712 }
4713 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4714 *out++ = (char) ch;
4715 }
4716 else {
4717 *out++ = '+';
4718 inShift = 1;
4719 goto encode_char;
4720 }
4721 }
4722 continue;
4723encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004724 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004725 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004726
Antoine Pitrou244651a2009-05-04 18:56:13 +00004727 /* code first surrogate */
4728 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004729 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004730 while (base64bits >= 6) {
4731 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4732 base64bits -= 6;
4733 }
4734 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004735 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004736 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004737 base64bits += 16;
4738 base64buffer = (base64buffer << 16) | ch;
4739 while (base64bits >= 6) {
4740 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4741 base64bits -= 6;
4742 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004743 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004744 if (base64bits)
4745 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4746 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004748 if (_PyBytes_Resize(&v, out - start) < 0)
4749 return NULL;
4750 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004751}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004752PyObject *
4753PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4754 Py_ssize_t size,
4755 int base64SetO,
4756 int base64WhiteSpace,
4757 const char *errors)
4758{
4759 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004760 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004761 if (tmp == NULL)
4762 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004763 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004764 base64WhiteSpace, errors);
4765 Py_DECREF(tmp);
4766 return result;
4767}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004768
Antoine Pitrou244651a2009-05-04 18:56:13 +00004769#undef IS_BASE64
4770#undef FROM_BASE64
4771#undef TO_BASE64
4772#undef DECODE_DIRECT
4773#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004774
Guido van Rossumd57fd912000-03-10 22:53:23 +00004775/* --- UTF-8 Codec -------------------------------------------------------- */
4776
Alexander Belopolsky40018472011-02-26 01:02:56 +00004777PyObject *
4778PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004779 Py_ssize_t size,
4780 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004781{
Walter Dörwald69652032004-09-07 20:24:22 +00004782 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4783}
4784
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004785#include "stringlib/asciilib.h"
4786#include "stringlib/codecs.h"
4787#include "stringlib/undef.h"
4788
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004789#include "stringlib/ucs1lib.h"
4790#include "stringlib/codecs.h"
4791#include "stringlib/undef.h"
4792
4793#include "stringlib/ucs2lib.h"
4794#include "stringlib/codecs.h"
4795#include "stringlib/undef.h"
4796
4797#include "stringlib/ucs4lib.h"
4798#include "stringlib/codecs.h"
4799#include "stringlib/undef.h"
4800
Antoine Pitrouab868312009-01-10 15:40:25 +00004801/* Mask to quickly check whether a C 'long' contains a
4802 non-ASCII, UTF8-encoded char. */
4803#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004804# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004805#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004806# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004807#else
4808# error C 'long' size should be either 4 or 8!
4809#endif
4810
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004811static Py_ssize_t
4812ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004813{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004814 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004815 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004816
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004817 /*
4818 * Issue #17237: m68k is a bit different from most architectures in
4819 * that objects do not use "natural alignment" - for example, int and
4820 * long are only aligned at 2-byte boundaries. Therefore the assert()
4821 * won't work; also, tests have shown that skipping the "optimised
4822 * version" will even speed up m68k.
4823 */
4824#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004825#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004826 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4827 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004828 /* Fast path, see in STRINGLIB(utf8_decode) for
4829 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004830 /* Help allocation */
4831 const char *_p = p;
4832 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004833 while (_p < aligned_end) {
4834 unsigned long value = *(const unsigned long *) _p;
4835 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004836 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004837 *((unsigned long *)q) = value;
4838 _p += SIZEOF_LONG;
4839 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004840 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004841 p = _p;
4842 while (p < end) {
4843 if ((unsigned char)*p & 0x80)
4844 break;
4845 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004846 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004847 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004848 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004849#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004850#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004851 while (p < end) {
4852 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4853 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004854 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004855 /* Help allocation */
4856 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004857 while (_p < aligned_end) {
4858 unsigned long value = *(unsigned long *) _p;
4859 if (value & ASCII_CHAR_MASK)
4860 break;
4861 _p += SIZEOF_LONG;
4862 }
4863 p = _p;
4864 if (_p == end)
4865 break;
4866 }
4867 if ((unsigned char)*p & 0x80)
4868 break;
4869 ++p;
4870 }
4871 memcpy(dest, start, p - start);
4872 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004873}
Antoine Pitrouab868312009-01-10 15:40:25 +00004874
Victor Stinner709d23d2019-05-02 14:56:30 -04004875static PyObject *
4876unicode_decode_utf8(const char *s, Py_ssize_t size,
4877 _Py_error_handler error_handler, const char *errors,
4878 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004879{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004880 _PyUnicodeWriter writer;
Victor Stinner785938e2011-12-11 20:09:03 +01004881 const char *starts = s;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004882 const char *end = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004883
4884 Py_ssize_t startinpos;
4885 Py_ssize_t endinpos;
4886 const char *errmsg = "";
Victor Stinner1d65d912015-10-05 13:43:50 +02004887 PyObject *error_handler_obj = NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004888 PyObject *exc = NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01004889
4890 if (size == 0) {
4891 if (consumed)
4892 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004893 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004894 }
4895
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004896 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4897 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004898 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 *consumed = 1;
4900 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004901 }
4902
Victor Stinner8f674cc2013-04-17 23:02:17 +02004903 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02004904 writer.min_length = size;
4905 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004906 goto onError;
Victor Stinner785938e2011-12-11 20:09:03 +01004907
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004908 writer.pos = ascii_decode(s, end, writer.data);
4909 s += writer.pos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004910 while (s < end) {
4911 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004912 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004913
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004914 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004915 if (PyUnicode_IS_ASCII(writer.buffer))
4916 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004917 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004918 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004919 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004920 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004921 } else {
4922 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004923 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004924 }
4925
4926 switch (ch) {
4927 case 0:
4928 if (s == end || consumed)
4929 goto End;
4930 errmsg = "unexpected end of data";
4931 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004932 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004933 break;
4934 case 1:
4935 errmsg = "invalid start byte";
4936 startinpos = s - starts;
4937 endinpos = startinpos + 1;
4938 break;
4939 case 2:
Miss Islington (bot)d32594a2019-06-25 02:12:16 -07004940 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4941 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
4942 {
4943 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02004944 goto End;
4945 }
Miss Islington (bot)d32594a2019-06-25 02:12:16 -07004946 /* fall through */
4947 case 3:
4948 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004949 errmsg = "invalid continuation byte";
4950 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004951 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004952 break;
4953 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004954 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004955 goto onError;
4956 continue;
4957 }
4958
Victor Stinner1d65d912015-10-05 13:43:50 +02004959 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02004960 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02004961
4962 switch (error_handler) {
4963 case _Py_ERROR_IGNORE:
4964 s += (endinpos - startinpos);
4965 break;
4966
4967 case _Py_ERROR_REPLACE:
4968 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
4969 goto onError;
4970 s += (endinpos - startinpos);
4971 break;
4972
4973 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02004974 {
4975 Py_ssize_t i;
4976
Victor Stinner1d65d912015-10-05 13:43:50 +02004977 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
4978 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004979 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02004980 ch = (Py_UCS4)(unsigned char)(starts[i]);
4981 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
4982 ch + 0xdc00);
4983 writer.pos++;
4984 }
4985 s += (endinpos - startinpos);
4986 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02004987 }
Victor Stinner1d65d912015-10-05 13:43:50 +02004988
4989 default:
4990 if (unicode_decode_call_errorhandler_writer(
4991 errors, &error_handler_obj,
4992 "utf-8", errmsg,
4993 &starts, &end, &startinpos, &endinpos, &exc, &s,
4994 &writer))
4995 goto onError;
4996 }
Victor Stinner785938e2011-12-11 20:09:03 +01004997 }
4998
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004999End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005000 if (consumed)
5001 *consumed = s - starts;
5002
Victor Stinner1d65d912015-10-05 13:43:50 +02005003 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005004 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005005 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005006
5007onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005008 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005009 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005010 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005011 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005012}
5013
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005014
Victor Stinner709d23d2019-05-02 14:56:30 -04005015PyObject *
5016PyUnicode_DecodeUTF8Stateful(const char *s,
5017 Py_ssize_t size,
5018 const char *errors,
5019 Py_ssize_t *consumed)
5020{
5021 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5022}
5023
5024
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005025/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5026 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005027
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005028 On success, write a pointer to a newly allocated wide character string into
5029 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5030 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005031
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005032 On memory allocation failure, return -1.
5033
5034 On decoding error (if surrogateescape is zero), return -2. If wlen is
5035 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5036 is not NULL, write the decoding error message into *reason. */
5037int
5038_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005039 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005040{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005041 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005042 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005043 wchar_t *unicode;
5044 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005045
Victor Stinner3d4226a2018-08-29 22:21:32 +02005046 int surrogateescape = 0;
5047 int surrogatepass = 0;
5048 switch (errors)
5049 {
5050 case _Py_ERROR_STRICT:
5051 break;
5052 case _Py_ERROR_SURROGATEESCAPE:
5053 surrogateescape = 1;
5054 break;
5055 case _Py_ERROR_SURROGATEPASS:
5056 surrogatepass = 1;
5057 break;
5058 default:
5059 return -3;
5060 }
5061
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005062 /* Note: size will always be longer than the resulting Unicode
5063 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005064 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005065 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005066 }
5067
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005068 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005069 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005070 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005071 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005072
5073 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005074 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005075 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005076 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005077 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005078#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005079 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005080#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005081 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005082#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005083 if (ch > 0xFF) {
5084#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005085 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005086#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005087 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005088 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005089 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5090 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5091#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005092 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005093 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005094 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005095 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005096 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005097
5098 if (surrogateescape) {
5099 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5100 }
5101 else {
5102 /* Is it a valid three-byte code? */
5103 if (surrogatepass
5104 && (e - s) >= 3
5105 && (s[0] & 0xf0) == 0xe0
5106 && (s[1] & 0xc0) == 0x80
5107 && (s[2] & 0xc0) == 0x80)
5108 {
5109 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5110 s += 3;
5111 unicode[outpos++] = ch;
5112 }
5113 else {
5114 PyMem_RawFree(unicode );
5115 if (reason != NULL) {
5116 switch (ch) {
5117 case 0:
5118 *reason = "unexpected end of data";
5119 break;
5120 case 1:
5121 *reason = "invalid start byte";
5122 break;
5123 /* 2, 3, 4 */
5124 default:
5125 *reason = "invalid continuation byte";
5126 break;
5127 }
5128 }
5129 if (wlen != NULL) {
5130 *wlen = s - orig_s;
5131 }
5132 return -2;
5133 }
5134 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005138 if (wlen) {
5139 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005140 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005141 *wstr = unicode;
5142 return 0;
5143}
5144
Victor Stinner5f9cf232019-03-19 01:46:25 +01005145
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005147_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5148 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005149{
5150 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005151 int res = _Py_DecodeUTF8Ex(arg, arglen,
5152 &wstr, wlen,
5153 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005155 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5156 assert(res != -3);
5157 if (wlen) {
5158 *wlen = (size_t)res;
5159 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005160 return NULL;
5161 }
5162 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005163}
5164
Antoine Pitrouab868312009-01-10 15:40:25 +00005165
Victor Stinnere47e6982017-12-21 15:45:16 +01005166/* UTF-8 encoder using the surrogateescape error handler .
5167
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005168 On success, return 0 and write the newly allocated character string (use
5169 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005170
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005171 On encoding failure, return -2 and write the position of the invalid
5172 surrogate character into *error_pos (if error_pos is set) and the decoding
5173 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005174
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005175 On memory allocation failure, return -1. */
5176int
5177_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005178 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005179{
5180 const Py_ssize_t max_char_size = 4;
5181 Py_ssize_t len = wcslen(text);
5182
5183 assert(len >= 0);
5184
Victor Stinner3d4226a2018-08-29 22:21:32 +02005185 int surrogateescape = 0;
5186 int surrogatepass = 0;
5187 switch (errors)
5188 {
5189 case _Py_ERROR_STRICT:
5190 break;
5191 case _Py_ERROR_SURROGATEESCAPE:
5192 surrogateescape = 1;
5193 break;
5194 case _Py_ERROR_SURROGATEPASS:
5195 surrogatepass = 1;
5196 break;
5197 default:
5198 return -3;
5199 }
5200
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005201 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5202 return -1;
5203 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005204 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005205 if (raw_malloc) {
5206 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005207 }
5208 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005209 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005210 }
5211 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005212 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005213 }
5214
5215 char *p = bytes;
5216 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005217 for (i = 0; i < len; ) {
5218 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005219 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005220 i++;
5221#if Py_UNICODE_SIZE == 2
5222 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5223 && i < len
5224 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5225 {
5226 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5227 i++;
5228 }
5229#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005230
5231 if (ch < 0x80) {
5232 /* Encode ASCII */
5233 *p++ = (char) ch;
5234
5235 }
5236 else if (ch < 0x0800) {
5237 /* Encode Latin-1 */
5238 *p++ = (char)(0xc0 | (ch >> 6));
5239 *p++ = (char)(0x80 | (ch & 0x3f));
5240 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005241 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005242 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005243 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005244 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005245 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005246 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005247 if (reason != NULL) {
5248 *reason = "encoding error";
5249 }
5250 if (raw_malloc) {
5251 PyMem_RawFree(bytes);
5252 }
5253 else {
5254 PyMem_Free(bytes);
5255 }
5256 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005257 }
5258 *p++ = (char)(ch & 0xff);
5259 }
5260 else if (ch < 0x10000) {
5261 *p++ = (char)(0xe0 | (ch >> 12));
5262 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5263 *p++ = (char)(0x80 | (ch & 0x3f));
5264 }
5265 else { /* ch >= 0x10000 */
5266 assert(ch <= MAX_UNICODE);
5267 /* Encode UCS4 Unicode ordinals */
5268 *p++ = (char)(0xf0 | (ch >> 18));
5269 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5270 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5271 *p++ = (char)(0x80 | (ch & 0x3f));
5272 }
5273 }
5274 *p++ = '\0';
5275
5276 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005277 char *bytes2;
5278 if (raw_malloc) {
5279 bytes2 = PyMem_RawRealloc(bytes, final_size);
5280 }
5281 else {
5282 bytes2 = PyMem_Realloc(bytes, final_size);
5283 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005284 if (bytes2 == NULL) {
5285 if (error_pos != NULL) {
5286 *error_pos = (size_t)-1;
5287 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005288 if (raw_malloc) {
5289 PyMem_RawFree(bytes);
5290 }
5291 else {
5292 PyMem_Free(bytes);
5293 }
5294 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005295 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005296 *str = bytes2;
5297 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005298}
5299
5300
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005301/* Primary internal function which creates utf8 encoded bytes objects.
5302
5303 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005304 and allocate exactly as much space needed at the end. Else allocate the
5305 maximum possible needed (4 result bytes per Unicode character), and return
5306 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005307*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005308static PyObject *
5309unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5310 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005311{
Victor Stinner6099a032011-12-18 14:22:26 +01005312 enum PyUnicode_Kind kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005313 void *data;
5314 Py_ssize_t size;
Marc-André Lemburgbd3be8f2002-02-07 11:33:49 +00005315
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005316 if (!PyUnicode_Check(unicode)) {
5317 PyErr_BadArgument();
5318 return NULL;
5319 }
5320
5321 if (PyUnicode_READY(unicode) == -1)
5322 return NULL;
5323
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005324 if (PyUnicode_UTF8(unicode))
5325 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5326 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005327
5328 kind = PyUnicode_KIND(unicode);
5329 data = PyUnicode_DATA(unicode);
5330 size = PyUnicode_GET_LENGTH(unicode);
5331
Benjamin Petersonead6b532011-12-20 17:23:42 -06005332 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005333 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005334 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005335 case PyUnicode_1BYTE_KIND:
5336 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5337 assert(!PyUnicode_IS_ASCII(unicode));
Victor Stinner709d23d2019-05-02 14:56:30 -04005338 return ucs1lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005339 case PyUnicode_2BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005340 return ucs2lib_utf8_encoder(unicode, data, size, error_handler, errors);
Victor Stinner6099a032011-12-18 14:22:26 +01005341 case PyUnicode_4BYTE_KIND:
Victor Stinner709d23d2019-05-02 14:56:30 -04005342 return ucs4lib_utf8_encoder(unicode, data, size, error_handler, errors);
Tim Peters602f7402002-04-27 18:03:26 +00005343 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005344}
5345
Alexander Belopolsky40018472011-02-26 01:02:56 +00005346PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005347_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5348{
5349 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5350}
5351
5352
5353PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005354PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5355 Py_ssize_t size,
5356 const char *errors)
5357{
5358 PyObject *v, *unicode;
5359
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005360 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005361 if (unicode == NULL)
5362 return NULL;
5363 v = _PyUnicode_AsUTF8String(unicode, errors);
5364 Py_DECREF(unicode);
5365 return v;
5366}
5367
5368PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005369PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005370{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005371 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005372}
5373
Walter Dörwald41980ca2007-08-16 21:55:45 +00005374/* --- UTF-32 Codec ------------------------------------------------------- */
5375
5376PyObject *
5377PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005378 Py_ssize_t size,
5379 const char *errors,
5380 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005381{
5382 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5383}
5384
5385PyObject *
5386PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005387 Py_ssize_t size,
5388 const char *errors,
5389 int *byteorder,
5390 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005391{
5392 const char *starts = s;
5393 Py_ssize_t startinpos;
5394 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005395 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005396 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005397 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005398 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005399 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005400 PyObject *errorHandler = NULL;
5401 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005402
Walter Dörwald41980ca2007-08-16 21:55:45 +00005403 q = (unsigned char *)s;
5404 e = q + size;
5405
5406 if (byteorder)
5407 bo = *byteorder;
5408
5409 /* Check for BOM marks (U+FEFF) in the input and adjust current
5410 byte order setting accordingly. In native mode, the leading BOM
5411 mark is skipped, in all other modes, it is copied to the output
5412 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005413 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005414 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005415 if (bom == 0x0000FEFF) {
5416 bo = -1;
5417 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005418 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005419 else if (bom == 0xFFFE0000) {
5420 bo = 1;
5421 q += 4;
5422 }
5423 if (byteorder)
5424 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005425 }
5426
Victor Stinnere64322e2012-10-30 23:12:47 +01005427 if (q == e) {
5428 if (consumed)
5429 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005430 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005431 }
5432
Victor Stinnere64322e2012-10-30 23:12:47 +01005433#ifdef WORDS_BIGENDIAN
5434 le = bo < 0;
5435#else
5436 le = bo <= 0;
5437#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005438 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005439
Victor Stinner8f674cc2013-04-17 23:02:17 +02005440 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005441 writer.min_length = (e - q + 3) / 4;
5442 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005443 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005444
Victor Stinnere64322e2012-10-30 23:12:47 +01005445 while (1) {
5446 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005447 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005448
Victor Stinnere64322e2012-10-30 23:12:47 +01005449 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005450 enum PyUnicode_Kind kind = writer.kind;
5451 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005452 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005453 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005454 if (le) {
5455 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005456 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005457 if (ch > maxch)
5458 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005459 if (kind != PyUnicode_1BYTE_KIND &&
5460 Py_UNICODE_IS_SURROGATE(ch))
5461 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005462 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005463 q += 4;
5464 } while (q <= last);
5465 }
5466 else {
5467 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005468 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005469 if (ch > maxch)
5470 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005471 if (kind != PyUnicode_1BYTE_KIND &&
5472 Py_UNICODE_IS_SURROGATE(ch))
5473 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005474 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005475 q += 4;
5476 } while (q <= last);
5477 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005478 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005479 }
5480
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005481 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005482 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005483 startinpos = ((const char *)q) - starts;
5484 endinpos = startinpos + 4;
5485 }
5486 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005487 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005488 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005489 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005490 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005491 startinpos = ((const char *)q) - starts;
5492 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005493 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005494 else {
5495 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005496 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005497 goto onError;
5498 q += 4;
5499 continue;
5500 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005501 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005502 startinpos = ((const char *)q) - starts;
5503 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005504 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005505
5506 /* The remaining input chars are ignored if the callback
5507 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005508 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005509 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005510 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005511 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005512 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005513 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005514 }
5515
Walter Dörwald41980ca2007-08-16 21:55:45 +00005516 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005517 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518
Walter Dörwald41980ca2007-08-16 21:55:45 +00005519 Py_XDECREF(errorHandler);
5520 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005521 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522
Benjamin Peterson29060642009-01-31 22:14:21 +00005523 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005524 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005525 Py_XDECREF(errorHandler);
5526 Py_XDECREF(exc);
5527 return NULL;
5528}
5529
5530PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005531_PyUnicode_EncodeUTF32(PyObject *str,
5532 const char *errors,
5533 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005534{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005535 enum PyUnicode_Kind kind;
5536 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005537 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005538 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005539 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005540#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005541 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005542#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005543 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005544#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005545 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005546 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005547 PyObject *errorHandler = NULL;
5548 PyObject *exc = NULL;
5549 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005550
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005551 if (!PyUnicode_Check(str)) {
5552 PyErr_BadArgument();
5553 return NULL;
5554 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005555 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005556 return NULL;
5557 kind = PyUnicode_KIND(str);
5558 data = PyUnicode_DATA(str);
5559 len = PyUnicode_GET_LENGTH(str);
5560
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005561 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005562 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005563 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005564 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005565 if (v == NULL)
5566 return NULL;
5567
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005568 /* output buffer is 4-bytes aligned */
5569 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005570 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005571 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005572 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005573 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005574 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005575
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005576 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005577 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005578 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005579 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005580 else
5581 encoding = "utf-32";
5582
5583 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005584 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5585 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005586 }
5587
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005588 pos = 0;
5589 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005590 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005591
5592 if (kind == PyUnicode_2BYTE_KIND) {
5593 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5594 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005595 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005596 else {
5597 assert(kind == PyUnicode_4BYTE_KIND);
5598 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5599 &out, native_ordering);
5600 }
5601 if (pos == len)
5602 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005603
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005604 rep = unicode_encode_call_errorhandler(
5605 errors, &errorHandler,
5606 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005607 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005608 if (!rep)
5609 goto error;
5610
5611 if (PyBytes_Check(rep)) {
5612 repsize = PyBytes_GET_SIZE(rep);
5613 if (repsize & 3) {
5614 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005615 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005616 "surrogates not allowed");
5617 goto error;
5618 }
5619 moreunits = repsize / 4;
5620 }
5621 else {
5622 assert(PyUnicode_Check(rep));
5623 if (PyUnicode_READY(rep) < 0)
5624 goto error;
5625 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5626 if (!PyUnicode_IS_ASCII(rep)) {
5627 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005628 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005629 "surrogates not allowed");
5630 goto error;
5631 }
5632 }
5633
5634 /* four bytes are reserved for each surrogate */
5635 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005636 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005637 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005638 /* integer overflow */
5639 PyErr_NoMemory();
5640 goto error;
5641 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005642 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005643 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005644 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005645 }
5646
5647 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005648 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005649 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005650 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005651 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005652 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5653 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005654 }
5655
5656 Py_CLEAR(rep);
5657 }
5658
5659 /* Cut back to size actually needed. This is necessary for, for example,
5660 encoding of a string containing isolated surrogates and the 'ignore'
5661 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005662 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005663 if (nsize != PyBytes_GET_SIZE(v))
5664 _PyBytes_Resize(&v, nsize);
5665 Py_XDECREF(errorHandler);
5666 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005667 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005668 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005669 error:
5670 Py_XDECREF(rep);
5671 Py_XDECREF(errorHandler);
5672 Py_XDECREF(exc);
5673 Py_XDECREF(v);
5674 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005675}
5676
Alexander Belopolsky40018472011-02-26 01:02:56 +00005677PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005678PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5679 Py_ssize_t size,
5680 const char *errors,
5681 int byteorder)
5682{
5683 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005684 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005685 if (tmp == NULL)
5686 return NULL;
5687 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5688 Py_DECREF(tmp);
5689 return result;
5690}
5691
5692PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005693PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005694{
Victor Stinnerb960b342011-11-20 19:12:52 +01005695 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005696}
5697
Guido van Rossumd57fd912000-03-10 22:53:23 +00005698/* --- UTF-16 Codec ------------------------------------------------------- */
5699
Tim Peters772747b2001-08-09 22:21:55 +00005700PyObject *
5701PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005702 Py_ssize_t size,
5703 const char *errors,
5704 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005705{
Walter Dörwald69652032004-09-07 20:24:22 +00005706 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5707}
5708
5709PyObject *
5710PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005711 Py_ssize_t size,
5712 const char *errors,
5713 int *byteorder,
5714 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005715{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005716 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005717 Py_ssize_t startinpos;
5718 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005719 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005720 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005721 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005722 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005723 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005724 PyObject *errorHandler = NULL;
5725 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005727
Tim Peters772747b2001-08-09 22:21:55 +00005728 q = (unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005729 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005730
5731 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005732 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005733
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005734 /* Check for BOM marks (U+FEFF) in the input and adjust current
5735 byte order setting accordingly. In native mode, the leading BOM
5736 mark is skipped, in all other modes, it is copied to the output
5737 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005738 if (bo == 0 && size >= 2) {
5739 const Py_UCS4 bom = (q[1] << 8) | q[0];
5740 if (bom == 0xFEFF) {
5741 q += 2;
5742 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005743 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005744 else if (bom == 0xFFFE) {
5745 q += 2;
5746 bo = 1;
5747 }
5748 if (byteorder)
5749 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005750 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005751
Antoine Pitrou63065d72012-05-15 23:48:04 +02005752 if (q == e) {
5753 if (consumed)
5754 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005755 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005756 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005757
Christian Heimes743e0cd2012-10-17 23:52:17 +02005758#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005759 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005760 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005761#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005762 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005764#endif
Tim Peters772747b2001-08-09 22:21:55 +00005765
Antoine Pitrou63065d72012-05-15 23:48:04 +02005766 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005767 character count normally. Error handler will take care of
5768 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005769 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005770 writer.min_length = (e - q + 1) / 2;
5771 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005772 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005773
Antoine Pitrou63065d72012-05-15 23:48:04 +02005774 while (1) {
5775 Py_UCS4 ch = 0;
5776 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005777 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005778 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005779 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005780 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005781 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005782 native_ordering);
5783 else
5784 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005785 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005786 native_ordering);
5787 } else if (kind == PyUnicode_2BYTE_KIND) {
5788 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005789 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005790 native_ordering);
5791 } else {
5792 assert(kind == PyUnicode_4BYTE_KIND);
5793 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005794 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005795 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005796 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005797 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005798
Antoine Pitrou63065d72012-05-15 23:48:04 +02005799 switch (ch)
5800 {
5801 case 0:
5802 /* remaining byte at the end? (size should be even) */
5803 if (q == e || consumed)
5804 goto End;
5805 errmsg = "truncated data";
5806 startinpos = ((const char *)q) - starts;
5807 endinpos = ((const char *)e) - starts;
5808 break;
5809 /* The remaining input chars are ignored if the callback
5810 chooses to skip the input */
5811 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005812 q -= 2;
5813 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005814 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005815 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005816 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005817 endinpos = ((const char *)e) - starts;
5818 break;
5819 case 2:
5820 errmsg = "illegal encoding";
5821 startinpos = ((const char *)q) - 2 - starts;
5822 endinpos = startinpos + 2;
5823 break;
5824 case 3:
5825 errmsg = "illegal UTF-16 surrogate";
5826 startinpos = ((const char *)q) - 4 - starts;
5827 endinpos = startinpos + 2;
5828 break;
5829 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005830 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005831 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005832 continue;
5833 }
5834
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005835 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005836 errors,
5837 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005838 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005839 &starts,
5840 (const char **)&e,
5841 &startinpos,
5842 &endinpos,
5843 &exc,
5844 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005845 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005846 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005847 }
5848
Antoine Pitrou63065d72012-05-15 23:48:04 +02005849End:
Walter Dörwald69652032004-09-07 20:24:22 +00005850 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005851 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005852
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005853 Py_XDECREF(errorHandler);
5854 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005855 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005856
Benjamin Peterson29060642009-01-31 22:14:21 +00005857 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005858 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005859 Py_XDECREF(errorHandler);
5860 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005861 return NULL;
5862}
5863
Tim Peters772747b2001-08-09 22:21:55 +00005864PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005865_PyUnicode_EncodeUTF16(PyObject *str,
5866 const char *errors,
5867 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005868{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005869 enum PyUnicode_Kind kind;
5870 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005871 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005872 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005873 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005874 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005875#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005876 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005877#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005878 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005879#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005880 const char *encoding;
5881 Py_ssize_t nsize, pos;
5882 PyObject *errorHandler = NULL;
5883 PyObject *exc = NULL;
5884 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00005885
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005886 if (!PyUnicode_Check(str)) {
5887 PyErr_BadArgument();
5888 return NULL;
5889 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005890 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005891 return NULL;
5892 kind = PyUnicode_KIND(str);
5893 data = PyUnicode_DATA(str);
5894 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01005895
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005896 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005897 if (kind == PyUnicode_4BYTE_KIND) {
5898 const Py_UCS4 *in = (const Py_UCS4 *)data;
5899 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005900 while (in < end) {
5901 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005902 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005903 }
5904 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005905 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005906 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00005907 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005908 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005909 nsize = len + pairs + (byteorder == 0);
5910 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005911 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00005912 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005913 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005914
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005915 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02005916 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005917 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005918 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005919 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005920 }
5921 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00005922 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005923 }
Tim Peters772747b2001-08-09 22:21:55 +00005924
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005925 if (kind == PyUnicode_1BYTE_KIND) {
5926 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5927 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00005928 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00005929
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005930 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005931 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005932 }
5933 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005934 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005935 }
5936 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005937 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02005938 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005939
5940 pos = 0;
5941 while (pos < len) {
5942 Py_ssize_t repsize, moreunits;
5943
5944 if (kind == PyUnicode_2BYTE_KIND) {
5945 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5946 &out, native_ordering);
5947 }
5948 else {
5949 assert(kind == PyUnicode_4BYTE_KIND);
5950 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5951 &out, native_ordering);
5952 }
5953 if (pos == len)
5954 break;
5955
5956 rep = unicode_encode_call_errorhandler(
5957 errors, &errorHandler,
5958 encoding, "surrogates not allowed",
5959 str, &exc, pos, pos + 1, &pos);
5960 if (!rep)
5961 goto error;
5962
5963 if (PyBytes_Check(rep)) {
5964 repsize = PyBytes_GET_SIZE(rep);
5965 if (repsize & 1) {
5966 raise_encode_exception(&exc, encoding,
5967 str, pos - 1, pos,
5968 "surrogates not allowed");
5969 goto error;
5970 }
5971 moreunits = repsize / 2;
5972 }
5973 else {
5974 assert(PyUnicode_Check(rep));
5975 if (PyUnicode_READY(rep) < 0)
5976 goto error;
5977 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5978 if (!PyUnicode_IS_ASCII(rep)) {
5979 raise_encode_exception(&exc, encoding,
5980 str, pos - 1, pos,
5981 "surrogates not allowed");
5982 goto error;
5983 }
5984 }
5985
5986 /* two bytes are reserved for each surrogate */
5987 if (moreunits > 1) {
5988 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005989 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005990 /* integer overflow */
5991 PyErr_NoMemory();
5992 goto error;
5993 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005994 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005995 goto error;
5996 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5997 }
5998
5999 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006000 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006001 out += moreunits;
6002 } else /* rep is unicode */ {
6003 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6004 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6005 &out, native_ordering);
6006 }
6007
6008 Py_CLEAR(rep);
6009 }
6010
6011 /* Cut back to size actually needed. This is necessary for, for example,
6012 encoding of a string containing isolated surrogates and the 'ignore' handler
6013 is used. */
6014 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6015 if (nsize != PyBytes_GET_SIZE(v))
6016 _PyBytes_Resize(&v, nsize);
6017 Py_XDECREF(errorHandler);
6018 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006019 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006020 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006021 error:
6022 Py_XDECREF(rep);
6023 Py_XDECREF(errorHandler);
6024 Py_XDECREF(exc);
6025 Py_XDECREF(v);
6026 return NULL;
6027#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006028}
6029
Alexander Belopolsky40018472011-02-26 01:02:56 +00006030PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006031PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6032 Py_ssize_t size,
6033 const char *errors,
6034 int byteorder)
6035{
6036 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006037 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006038 if (tmp == NULL)
6039 return NULL;
6040 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6041 Py_DECREF(tmp);
6042 return result;
6043}
6044
6045PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006046PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006047{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006048 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006049}
6050
6051/* --- Unicode Escape Codec ----------------------------------------------- */
6052
Fredrik Lundh06d12682001-01-24 07:59:11 +00006053static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006054
Alexander Belopolsky40018472011-02-26 01:02:56 +00006055PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006056_PyUnicode_DecodeUnicodeEscape(const char *s,
6057 Py_ssize_t size,
6058 const char *errors,
6059 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006060{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006061 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006062 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006063 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006064 PyObject *errorHandler = NULL;
6065 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006066
Eric V. Smith42454af2016-10-31 09:22:08 -04006067 // so we can remember if we've seen an invalid escape char or not
6068 *first_invalid_escape = NULL;
6069
Victor Stinner62ec3312016-09-06 17:04:34 -07006070 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006071 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006072 }
6073 /* Escaped strings will always be longer than the resulting
6074 Unicode string, so we start with size here and then reduce the
6075 length after conversion to the true value.
6076 (but if the error callback returns a long replacement string
6077 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006078 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006079 writer.min_length = size;
6080 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6081 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006082 }
6083
Guido van Rossumd57fd912000-03-10 22:53:23 +00006084 end = s + size;
6085 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006086 unsigned char c = (unsigned char) *s++;
6087 Py_UCS4 ch;
6088 int count;
6089 Py_ssize_t startinpos;
6090 Py_ssize_t endinpos;
6091 const char *message;
6092
6093#define WRITE_ASCII_CHAR(ch) \
6094 do { \
6095 assert(ch <= 127); \
6096 assert(writer.pos < writer.size); \
6097 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6098 } while(0)
6099
6100#define WRITE_CHAR(ch) \
6101 do { \
6102 if (ch <= writer.maxchar) { \
6103 assert(writer.pos < writer.size); \
6104 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6105 } \
6106 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6107 goto onError; \
6108 } \
6109 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006110
6111 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006112 if (c != '\\') {
6113 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006114 continue;
6115 }
6116
Victor Stinner62ec3312016-09-06 17:04:34 -07006117 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006118 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006119 if (s >= end) {
6120 message = "\\ at end of string";
6121 goto error;
6122 }
6123 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006124
Victor Stinner62ec3312016-09-06 17:04:34 -07006125 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006126 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006127
Benjamin Peterson29060642009-01-31 22:14:21 +00006128 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006129 case '\n': continue;
6130 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6131 case '\'': WRITE_ASCII_CHAR('\''); continue;
6132 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6133 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006134 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006135 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6136 case 't': WRITE_ASCII_CHAR('\t'); continue;
6137 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6138 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006139 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006140 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006141 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006142 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006143
Benjamin Peterson29060642009-01-31 22:14:21 +00006144 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006145 case '0': case '1': case '2': case '3':
6146 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006147 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006148 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006149 ch = (ch<<3) + *s++ - '0';
6150 if (s < end && '0' <= *s && *s <= '7') {
6151 ch = (ch<<3) + *s++ - '0';
6152 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006153 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006154 WRITE_CHAR(ch);
6155 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006156
Benjamin Peterson29060642009-01-31 22:14:21 +00006157 /* hex escapes */
6158 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006159 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006160 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006161 message = "truncated \\xXX escape";
6162 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006163
Benjamin Peterson29060642009-01-31 22:14:21 +00006164 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006166 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006167 message = "truncated \\uXXXX escape";
6168 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006169
Benjamin Peterson29060642009-01-31 22:14:21 +00006170 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006171 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006172 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006173 message = "truncated \\UXXXXXXXX escape";
6174 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006175 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006176 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006177 ch <<= 4;
6178 if (c >= '0' && c <= '9') {
6179 ch += c - '0';
6180 }
6181 else if (c >= 'a' && c <= 'f') {
6182 ch += c - ('a' - 10);
6183 }
6184 else if (c >= 'A' && c <= 'F') {
6185 ch += c - ('A' - 10);
6186 }
6187 else {
6188 break;
6189 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006190 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006191 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006192 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006193 }
6194
6195 /* when we get here, ch is a 32-bit unicode character */
6196 if (ch > MAX_UNICODE) {
6197 message = "illegal Unicode character";
6198 goto error;
6199 }
6200
6201 WRITE_CHAR(ch);
6202 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006203
Benjamin Peterson29060642009-01-31 22:14:21 +00006204 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006205 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006206 if (ucnhash_CAPI == NULL) {
6207 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006208 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6209 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006210 if (ucnhash_CAPI == NULL) {
6211 PyErr_SetString(
6212 PyExc_UnicodeError,
6213 "\\N escapes not supported (can't load unicodedata module)"
6214 );
6215 goto onError;
6216 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006217 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006218
6219 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006220 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006221 const char *start = ++s;
6222 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006223 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006224 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006225 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006226 namelen = s - start;
6227 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006228 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006229 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 ch = 0xffffffff; /* in case 'getcode' messes up */
6231 if (namelen <= INT_MAX &&
6232 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6233 &ch, 0)) {
6234 assert(ch <= MAX_UNICODE);
6235 WRITE_CHAR(ch);
6236 continue;
6237 }
6238 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006239 }
6240 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006241 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006242
6243 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006244 if (*first_invalid_escape == NULL) {
6245 *first_invalid_escape = s-1; /* Back up one char, since we've
6246 already incremented s. */
6247 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006248 WRITE_ASCII_CHAR('\\');
6249 WRITE_CHAR(c);
6250 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006251 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006252
6253 error:
6254 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006255 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006256 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006257 errors, &errorHandler,
6258 "unicodeescape", message,
6259 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006261 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006262 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006263 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006264
6265#undef WRITE_ASCII_CHAR
6266#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006267 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006268
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006269 Py_XDECREF(errorHandler);
6270 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006271 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006272
Benjamin Peterson29060642009-01-31 22:14:21 +00006273 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006274 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006275 Py_XDECREF(errorHandler);
6276 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 return NULL;
6278}
6279
Eric V. Smith42454af2016-10-31 09:22:08 -04006280PyObject *
6281PyUnicode_DecodeUnicodeEscape(const char *s,
6282 Py_ssize_t size,
6283 const char *errors)
6284{
6285 const char *first_invalid_escape;
6286 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6287 &first_invalid_escape);
6288 if (result == NULL)
6289 return NULL;
6290 if (first_invalid_escape != NULL) {
6291 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6292 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006293 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006294 Py_DECREF(result);
6295 return NULL;
6296 }
6297 }
6298 return result;
6299}
6300
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006301/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006302
Alexander Belopolsky40018472011-02-26 01:02:56 +00006303PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006304PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006305{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006306 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006307 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006308 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006310 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006312
Ezio Melottie7f90372012-10-05 03:33:31 +03006313 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006314 escape.
6315
Ezio Melottie7f90372012-10-05 03:33:31 +03006316 For UCS1 strings it's '\xxx', 4 bytes per source character.
6317 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6318 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006319 */
6320
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006321 if (!PyUnicode_Check(unicode)) {
6322 PyErr_BadArgument();
6323 return NULL;
6324 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006325 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006326 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006327 }
Victor Stinner358af132015-10-12 22:36:57 +02006328
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006329 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006330 if (len == 0) {
6331 return PyBytes_FromStringAndSize(NULL, 0);
6332 }
6333
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006334 kind = PyUnicode_KIND(unicode);
6335 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006336 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6337 bytes, and 1 byte characters 4. */
6338 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006339 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006340 return PyErr_NoMemory();
6341 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006342 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006343 if (repr == NULL) {
6344 return NULL;
6345 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006346
Victor Stinner62ec3312016-09-06 17:04:34 -07006347 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006348 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006349 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006350
Victor Stinner62ec3312016-09-06 17:04:34 -07006351 /* U+0000-U+00ff range */
6352 if (ch < 0x100) {
6353 if (ch >= ' ' && ch < 127) {
6354 if (ch != '\\') {
6355 /* Copy printable US ASCII as-is */
6356 *p++ = (char) ch;
6357 }
6358 /* Escape backslashes */
6359 else {
6360 *p++ = '\\';
6361 *p++ = '\\';
6362 }
6363 }
Victor Stinner358af132015-10-12 22:36:57 +02006364
Victor Stinner62ec3312016-09-06 17:04:34 -07006365 /* Map special whitespace to '\t', \n', '\r' */
6366 else if (ch == '\t') {
6367 *p++ = '\\';
6368 *p++ = 't';
6369 }
6370 else if (ch == '\n') {
6371 *p++ = '\\';
6372 *p++ = 'n';
6373 }
6374 else if (ch == '\r') {
6375 *p++ = '\\';
6376 *p++ = 'r';
6377 }
6378
6379 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6380 else {
6381 *p++ = '\\';
6382 *p++ = 'x';
6383 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6384 *p++ = Py_hexdigits[ch & 0x000F];
6385 }
Tim Petersced69f82003-09-16 20:30:58 +00006386 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006387 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006388 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006389 *p++ = '\\';
6390 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006391 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6392 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6393 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6394 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006396 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6397 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006398
Victor Stinner62ec3312016-09-06 17:04:34 -07006399 /* Make sure that the first two digits are zero */
6400 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006401 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006402 *p++ = 'U';
6403 *p++ = '0';
6404 *p++ = '0';
6405 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6406 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6407 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6408 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6409 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6410 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006411 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006412 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006413
Victor Stinner62ec3312016-09-06 17:04:34 -07006414 assert(p - PyBytes_AS_STRING(repr) > 0);
6415 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6416 return NULL;
6417 }
6418 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006419}
6420
Alexander Belopolsky40018472011-02-26 01:02:56 +00006421PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6423 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006424{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006425 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006426 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006428 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 }
6430
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006431 result = PyUnicode_AsUnicodeEscapeString(tmp);
6432 Py_DECREF(tmp);
6433 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006434}
6435
6436/* --- Raw Unicode Escape Codec ------------------------------------------- */
6437
Alexander Belopolsky40018472011-02-26 01:02:56 +00006438PyObject *
6439PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006440 Py_ssize_t size,
6441 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006442{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006443 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006444 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006445 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006446 PyObject *errorHandler = NULL;
6447 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006448
Victor Stinner62ec3312016-09-06 17:04:34 -07006449 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006450 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006451 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006452
Guido van Rossumd57fd912000-03-10 22:53:23 +00006453 /* Escaped strings will always be longer than the resulting
6454 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006455 length after conversion to the true value. (But decoding error
6456 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006457 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 writer.min_length = size;
6459 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6460 goto onError;
6461 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006462
Guido van Rossumd57fd912000-03-10 22:53:23 +00006463 end = s + size;
6464 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 unsigned char c = (unsigned char) *s++;
6466 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006467 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006468 Py_ssize_t startinpos;
6469 Py_ssize_t endinpos;
6470 const char *message;
6471
6472#define WRITE_CHAR(ch) \
6473 do { \
6474 if (ch <= writer.maxchar) { \
6475 assert(writer.pos < writer.size); \
6476 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6477 } \
6478 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6479 goto onError; \
6480 } \
6481 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006482
Benjamin Peterson29060642009-01-31 22:14:21 +00006483 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006484 if (c != '\\' || s >= end) {
6485 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006486 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006487 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006488
Victor Stinner62ec3312016-09-06 17:04:34 -07006489 c = (unsigned char) *s++;
6490 if (c == 'u') {
6491 count = 4;
6492 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006493 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006494 else if (c == 'U') {
6495 count = 8;
6496 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006497 }
6498 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006499 assert(writer.pos < writer.size);
6500 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6501 WRITE_CHAR(c);
6502 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006503 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006504 startinpos = s - starts - 2;
6505
6506 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6507 for (ch = 0; count && s < end; ++s, --count) {
6508 c = (unsigned char)*s;
6509 ch <<= 4;
6510 if (c >= '0' && c <= '9') {
6511 ch += c - '0';
6512 }
6513 else if (c >= 'a' && c <= 'f') {
6514 ch += c - ('a' - 10);
6515 }
6516 else if (c >= 'A' && c <= 'F') {
6517 ch += c - ('A' - 10);
6518 }
6519 else {
6520 break;
6521 }
6522 }
6523 if (!count) {
6524 if (ch <= MAX_UNICODE) {
6525 WRITE_CHAR(ch);
6526 continue;
6527 }
6528 message = "\\Uxxxxxxxx out of range";
6529 }
6530
6531 endinpos = s-starts;
6532 writer.min_length = end - s + writer.pos;
6533 if (unicode_decode_call_errorhandler_writer(
6534 errors, &errorHandler,
6535 "rawunicodeescape", message,
6536 &starts, &end, &startinpos, &endinpos, &exc, &s,
6537 &writer)) {
6538 goto onError;
6539 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006540 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006541
6542#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006543 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006544 Py_XDECREF(errorHandler);
6545 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006546 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006547
Benjamin Peterson29060642009-01-31 22:14:21 +00006548 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006549 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006550 Py_XDECREF(errorHandler);
6551 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006553
Guido van Rossumd57fd912000-03-10 22:53:23 +00006554}
6555
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006556
Alexander Belopolsky40018472011-02-26 01:02:56 +00006557PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006558PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006559{
Victor Stinner62ec3312016-09-06 17:04:34 -07006560 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006561 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006562 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006563 int kind;
6564 void *data;
6565 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006566
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006567 if (!PyUnicode_Check(unicode)) {
6568 PyErr_BadArgument();
6569 return NULL;
6570 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006571 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006572 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006573 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006574 kind = PyUnicode_KIND(unicode);
6575 data = PyUnicode_DATA(unicode);
6576 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 if (kind == PyUnicode_1BYTE_KIND) {
6578 return PyBytes_FromStringAndSize(data, len);
6579 }
Victor Stinner0e368262011-11-10 20:12:49 +01006580
Victor Stinner62ec3312016-09-06 17:04:34 -07006581 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6582 bytes, and 1 byte characters 4. */
6583 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006584
Victor Stinner62ec3312016-09-06 17:04:34 -07006585 if (len > PY_SSIZE_T_MAX / expandsize) {
6586 return PyErr_NoMemory();
6587 }
6588 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6589 if (repr == NULL) {
6590 return NULL;
6591 }
6592 if (len == 0) {
6593 return repr;
6594 }
6595
6596 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006597 for (pos = 0; pos < len; pos++) {
6598 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006599
Victor Stinner62ec3312016-09-06 17:04:34 -07006600 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6601 if (ch < 0x100) {
6602 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006603 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006604 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006605 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006606 *p++ = '\\';
6607 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006608 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6609 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6610 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6611 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006612 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006613 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6614 else {
6615 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6616 *p++ = '\\';
6617 *p++ = 'U';
6618 *p++ = '0';
6619 *p++ = '0';
6620 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6621 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6622 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6623 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6624 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6625 *p++ = Py_hexdigits[ch & 15];
6626 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006627 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006628
Victor Stinner62ec3312016-09-06 17:04:34 -07006629 assert(p > PyBytes_AS_STRING(repr));
6630 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6631 return NULL;
6632 }
6633 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006634}
6635
Alexander Belopolsky40018472011-02-26 01:02:56 +00006636PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006637PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6638 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006639{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006640 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006641 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006642 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006643 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006644 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6645 Py_DECREF(tmp);
6646 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006647}
6648
6649/* --- Latin-1 Codec ------------------------------------------------------ */
6650
Alexander Belopolsky40018472011-02-26 01:02:56 +00006651PyObject *
6652PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006653 Py_ssize_t size,
6654 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006655{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006656 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Victor Stinnere57b1c02011-09-28 22:20:48 +02006657 return _PyUnicode_FromUCS1((unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006658}
6659
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006660/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006661static void
6662make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006663 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006664 PyObject *unicode,
6665 Py_ssize_t startpos, Py_ssize_t endpos,
6666 const char *reason)
6667{
6668 if (*exceptionObject == NULL) {
6669 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006670 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006671 encoding, unicode, startpos, endpos, reason);
6672 }
6673 else {
6674 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6675 goto onError;
6676 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6677 goto onError;
6678 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6679 goto onError;
6680 return;
6681 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006682 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006683 }
6684}
6685
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006686/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006687static void
6688raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006689 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006690 PyObject *unicode,
6691 Py_ssize_t startpos, Py_ssize_t endpos,
6692 const char *reason)
6693{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006694 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006695 encoding, unicode, startpos, endpos, reason);
6696 if (*exceptionObject != NULL)
6697 PyCodec_StrictErrors(*exceptionObject);
6698}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006699
6700/* error handling callback helper:
6701 build arguments, call the callback and check the arguments,
6702 put the result into newpos and return the replacement string, which
6703 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006704static PyObject *
6705unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006706 PyObject **errorHandler,
6707 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006708 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006709 Py_ssize_t startpos, Py_ssize_t endpos,
6710 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006711{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006712 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006713 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006714 PyObject *restuple;
6715 PyObject *resunicode;
6716
6717 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006718 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006719 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006720 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006721 }
6722
Benjamin Petersonbac79492012-01-14 13:34:47 -05006723 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006724 return NULL;
6725 len = PyUnicode_GET_LENGTH(unicode);
6726
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006727 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006728 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006729 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006730 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006731
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01006732 restuple = PyObject_CallFunctionObjArgs(
6733 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006734 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006735 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006736 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006737 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006738 Py_DECREF(restuple);
6739 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006740 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006741 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006742 &resunicode, newpos)) {
6743 Py_DECREF(restuple);
6744 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006745 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006746 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6747 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6748 Py_DECREF(restuple);
6749 return NULL;
6750 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006751 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006752 *newpos = len + *newpos;
6753 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006754 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006755 Py_DECREF(restuple);
6756 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006757 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006758 Py_INCREF(resunicode);
6759 Py_DECREF(restuple);
6760 return resunicode;
6761}
6762
Alexander Belopolsky40018472011-02-26 01:02:56 +00006763static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006764unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006765 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006766 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006767{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006768 /* input state */
6769 Py_ssize_t pos=0, size;
6770 int kind;
6771 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006772 /* pointer into the output */
6773 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006774 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6775 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006776 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006777 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006778 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006779 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006780 /* output object */
6781 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006782
Benjamin Petersonbac79492012-01-14 13:34:47 -05006783 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006784 return NULL;
6785 size = PyUnicode_GET_LENGTH(unicode);
6786 kind = PyUnicode_KIND(unicode);
6787 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006788 /* allocate enough for a simple encoding without
6789 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006790 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006791 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006792
6793 _PyBytesWriter_Init(&writer);
6794 str = _PyBytesWriter_Alloc(&writer, size);
6795 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006796 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006797
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006798 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006799 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006800
Benjamin Peterson29060642009-01-31 22:14:21 +00006801 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006802 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006803 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006804 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006805 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006806 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006807 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006808 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006809 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006810 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006811 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006812 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006813
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006814 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006815 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006816
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006817 /* Only overallocate the buffer if it's not the last write */
6818 writer.overallocate = (collend < size);
6819
Benjamin Peterson29060642009-01-31 22:14:21 +00006820 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006821 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006822 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006823
6824 switch (error_handler) {
6825 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006826 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006827 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006828
6829 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006830 memset(str, '?', collend - collstart);
6831 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006832 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006833 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006834 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006835 break;
Victor Stinner50149202015-09-22 00:26:54 +02006836
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006837 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006838 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006839 writer.min_size -= (collend - collstart);
6840 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006841 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006842 if (str == NULL)
6843 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006844 pos = collend;
6845 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006846
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006847 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006848 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006849 writer.min_size -= (collend - collstart);
6850 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006851 unicode, collstart, collend);
6852 if (str == NULL)
6853 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006854 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 break;
Victor Stinner50149202015-09-22 00:26:54 +02006856
Victor Stinnerc3713e92015-09-29 12:32:13 +02006857 case _Py_ERROR_SURROGATEESCAPE:
6858 for (i = collstart; i < collend; ++i) {
6859 ch = PyUnicode_READ(kind, data, i);
6860 if (ch < 0xdc80 || 0xdcff < ch) {
6861 /* Not a UTF-8b surrogate */
6862 break;
6863 }
6864 *str++ = (char)(ch - 0xdc00);
6865 ++pos;
6866 }
6867 if (i >= collend)
6868 break;
6869 collstart = pos;
6870 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006871 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006872
Benjamin Peterson29060642009-01-31 22:14:21 +00006873 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006874 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6875 encoding, reason, unicode, &exc,
6876 collstart, collend, &newpos);
6877 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006878 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006879
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006880 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006881 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006882
Victor Stinner6bd525b2015-10-09 13:10:05 +02006883 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00006884 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02006885 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02006886 PyBytes_AS_STRING(rep),
6887 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006888 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02006889 else {
6890 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02006891
Victor Stinner6bd525b2015-10-09 13:10:05 +02006892 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00006893 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006894
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006895 if (limit == 256 ?
6896 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
6897 !PyUnicode_IS_ASCII(rep))
6898 {
6899 /* Not all characters are smaller than limit */
6900 raise_encode_exception(&exc, encoding, unicode,
6901 collstart, collend, reason);
6902 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00006903 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02006904 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6905 str = _PyBytesWriter_WriteBytes(&writer, str,
6906 PyUnicode_DATA(rep),
6907 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00006908 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03006909 if (str == NULL)
6910 goto onError;
6911
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006912 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006913 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006914 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006915
6916 /* If overallocation was disabled, ensure that it was the last
6917 write. Otherwise, we missed an optimization */
6918 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00006919 }
6920 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006921
Victor Stinner50149202015-09-22 00:26:54 +02006922 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006923 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006924 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006925
6926 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006927 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006928 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02006929 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006930 Py_XDECREF(exc);
6931 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006932}
6933
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006934/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006935PyObject *
6936PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006937 Py_ssize_t size,
6938 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006939{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006940 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006941 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006942 if (unicode == NULL)
6943 return NULL;
6944 result = unicode_encode_ucs1(unicode, errors, 256);
6945 Py_DECREF(unicode);
6946 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006947}
6948
Alexander Belopolsky40018472011-02-26 01:02:56 +00006949PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006950_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006951{
6952 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006953 PyErr_BadArgument();
6954 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006955 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006956 if (PyUnicode_READY(unicode) == -1)
6957 return NULL;
6958 /* Fast path: if it is a one-byte string, construct
6959 bytes object directly. */
6960 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6961 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6962 PyUnicode_GET_LENGTH(unicode));
6963 /* Non-Latin-1 characters present. Defer to above function to
6964 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006965 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006966}
6967
6968PyObject*
6969PyUnicode_AsLatin1String(PyObject *unicode)
6970{
6971 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006972}
6973
6974/* --- 7-bit ASCII Codec -------------------------------------------------- */
6975
Alexander Belopolsky40018472011-02-26 01:02:56 +00006976PyObject *
6977PyUnicode_DecodeASCII(const char *s,
6978 Py_ssize_t size,
6979 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006980{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006981 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006982 _PyUnicodeWriter writer;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01006983 int kind;
6984 void *data;
Martin v. Löwis18e16552006-02-15 17:27:45 +00006985 Py_ssize_t startinpos;
6986 Py_ssize_t endinpos;
6987 Py_ssize_t outpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006988 const char *e;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006989 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006990 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02006991 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00006992
Guido van Rossumd57fd912000-03-10 22:53:23 +00006993 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02006994 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01006995
Guido van Rossumd57fd912000-03-10 22:53:23 +00006996 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02006997 if (size == 1 && (unsigned char)s[0] < 128)
6998 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006999
Victor Stinner8f674cc2013-04-17 23:02:17 +02007000 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02007001 writer.min_length = size;
7002 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
Victor Stinner8f674cc2013-04-17 23:02:17 +02007003 return NULL;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007004
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007005 e = s + size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007006 data = writer.data;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007007 outpos = ascii_decode(s, e, (Py_UCS1 *)data);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007008 writer.pos = outpos;
7009 if (writer.pos == size)
7010 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007011
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007012 s += writer.pos;
7013 kind = writer.kind;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007014 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007015 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007016 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007017 PyUnicode_WRITE(kind, data, writer.pos, c);
7018 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007019 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007020 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007021 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007022
7023 /* byte outsize range 0x00..0x7f: call the error handler */
7024
7025 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007026 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007027
7028 switch (error_handler)
7029 {
7030 case _Py_ERROR_REPLACE:
7031 case _Py_ERROR_SURROGATEESCAPE:
7032 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007033 but we may switch to UCS2 at the first write */
7034 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7035 goto onError;
7036 kind = writer.kind;
7037 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007038
7039 if (error_handler == _Py_ERROR_REPLACE)
7040 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7041 else
7042 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7043 writer.pos++;
7044 ++s;
7045 break;
7046
7047 case _Py_ERROR_IGNORE:
7048 ++s;
7049 break;
7050
7051 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007052 startinpos = s-starts;
7053 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007054 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007055 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007056 "ascii", "ordinal not in range(128)",
7057 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007058 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007059 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007060 kind = writer.kind;
7061 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007062 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007063 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007064 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007065 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007066 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007067
Benjamin Peterson29060642009-01-31 22:14:21 +00007068 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007069 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007070 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007071 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 return NULL;
7073}
7074
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007075/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007076PyObject *
7077PyUnicode_EncodeASCII(const Py_UNICODE *p,
7078 Py_ssize_t size,
7079 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007080{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007081 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007082 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007083 if (unicode == NULL)
7084 return NULL;
7085 result = unicode_encode_ucs1(unicode, errors, 128);
7086 Py_DECREF(unicode);
7087 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007088}
7089
Alexander Belopolsky40018472011-02-26 01:02:56 +00007090PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007091_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007092{
7093 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007094 PyErr_BadArgument();
7095 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007096 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007097 if (PyUnicode_READY(unicode) == -1)
7098 return NULL;
7099 /* Fast path: if it is an ASCII-only string, construct bytes object
7100 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007101 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007102 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7103 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007104 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007105}
7106
7107PyObject *
7108PyUnicode_AsASCIIString(PyObject *unicode)
7109{
7110 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007111}
7112
Steve Dowercc16be82016-09-08 10:35:16 -07007113#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007114
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007115/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007116
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007117#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007118#define NEED_RETRY
7119#endif
7120
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7125#define DECODING_CHUNK_SIZE (INT_MAX/4)
7126
Victor Stinner3a50e702011-10-18 21:21:00 +02007127#ifndef WC_ERR_INVALID_CHARS
7128# define WC_ERR_INVALID_CHARS 0x0080
7129#endif
7130
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007131static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007132code_page_name(UINT code_page, PyObject **obj)
7133{
7134 *obj = NULL;
7135 if (code_page == CP_ACP)
7136 return "mbcs";
7137 if (code_page == CP_UTF7)
7138 return "CP_UTF7";
7139 if (code_page == CP_UTF8)
7140 return "CP_UTF8";
7141
7142 *obj = PyBytes_FromFormat("cp%u", code_page);
7143 if (*obj == NULL)
7144 return NULL;
7145 return PyBytes_AS_STRING(*obj);
7146}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007147
Victor Stinner3a50e702011-10-18 21:21:00 +02007148static DWORD
7149decode_code_page_flags(UINT code_page)
7150{
7151 if (code_page == CP_UTF7) {
7152 /* The CP_UTF7 decoder only supports flags=0 */
7153 return 0;
7154 }
7155 else
7156 return MB_ERR_INVALID_CHARS;
7157}
7158
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007159/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007160 * Decode a byte string from a Windows code page into unicode object in strict
7161 * mode.
7162 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007163 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7164 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007165 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007166static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007167decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007168 wchar_t **buf,
7169 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007170 const char *in,
7171 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007172{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007173 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007174 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007175 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007176
7177 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007178 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007179 while ((outsize = MultiByteToWideChar(code_page, flags,
7180 in, insize, NULL, 0)) <= 0)
7181 {
7182 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7183 goto error;
7184 }
7185 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7186 flags = 0;
7187 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007188
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007189 /* Extend a wchar_t* buffer */
7190 Py_ssize_t n = *bufsize; /* Get the current length */
7191 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7192 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007193 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007194 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007195
7196 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007197 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7198 if (outsize <= 0)
7199 goto error;
7200 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007201
Victor Stinner3a50e702011-10-18 21:21:00 +02007202error:
7203 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7204 return -2;
7205 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007206 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007207}
7208
Victor Stinner3a50e702011-10-18 21:21:00 +02007209/*
7210 * Decode a byte string from a code page into unicode object with an error
7211 * handler.
7212 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007213 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007214 * UnicodeDecodeError exception and returns -1 on error.
7215 */
7216static int
7217decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007218 wchar_t **buf,
7219 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007220 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007221 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007222{
7223 const char *startin = in;
7224 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007225 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007226 /* Ideally, we should get reason from FormatMessage. This is the Windows
7227 2000 English version of the message. */
7228 const char *reason = "No mapping for the Unicode character exists "
7229 "in the target code page.";
7230 /* each step cannot decode more than 1 character, but a character can be
7231 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007232 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007233 int insize;
7234 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007235 PyObject *errorHandler = NULL;
7236 PyObject *exc = NULL;
7237 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007238 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007239 DWORD err;
7240 int ret = -1;
7241
7242 assert(size > 0);
7243
7244 encoding = code_page_name(code_page, &encoding_obj);
7245 if (encoding == NULL)
7246 return -1;
7247
Victor Stinner7d00cc12014-03-17 23:08:06 +01007248 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007249 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7250 UnicodeDecodeError. */
7251 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7252 if (exc != NULL) {
7253 PyCodec_StrictErrors(exc);
7254 Py_CLEAR(exc);
7255 }
7256 goto error;
7257 }
7258
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007259 /* Extend a wchar_t* buffer */
7260 Py_ssize_t n = *bufsize; /* Get the current length */
7261 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7262 PyErr_NoMemory();
7263 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007264 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007265 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7266 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007267 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007268 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007269
7270 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007271 while (in < endin)
7272 {
7273 /* Decode a character */
7274 insize = 1;
7275 do
7276 {
7277 outsize = MultiByteToWideChar(code_page, flags,
7278 in, insize,
7279 buffer, Py_ARRAY_LENGTH(buffer));
7280 if (outsize > 0)
7281 break;
7282 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007283 if (err == ERROR_INVALID_FLAGS && flags) {
7284 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7285 flags = 0;
7286 continue;
7287 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007288 if (err != ERROR_NO_UNICODE_TRANSLATION
7289 && err != ERROR_INSUFFICIENT_BUFFER)
7290 {
7291 PyErr_SetFromWindowsErr(0);
7292 goto error;
7293 }
7294 insize++;
7295 }
7296 /* 4=maximum length of a UTF-8 sequence */
7297 while (insize <= 4 && (in + insize) <= endin);
7298
7299 if (outsize <= 0) {
7300 Py_ssize_t startinpos, endinpos, outpos;
7301
Victor Stinner7d00cc12014-03-17 23:08:06 +01007302 /* last character in partial decode? */
7303 if (in + insize >= endin && !final)
7304 break;
7305
Victor Stinner3a50e702011-10-18 21:21:00 +02007306 startinpos = in - startin;
7307 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007308 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007309 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007310 errors, &errorHandler,
7311 encoding, reason,
7312 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007313 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007314 {
7315 goto error;
7316 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007317 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007318 }
7319 else {
7320 in += insize;
7321 memcpy(out, buffer, outsize * sizeof(wchar_t));
7322 out += outsize;
7323 }
7324 }
7325
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007326 /* Shrink the buffer */
7327 assert(out - *buf <= *bufsize);
7328 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007329 /* (in - startin) <= size and size is an int */
7330 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007331
7332error:
7333 Py_XDECREF(encoding_obj);
7334 Py_XDECREF(errorHandler);
7335 Py_XDECREF(exc);
7336 return ret;
7337}
7338
Victor Stinner3a50e702011-10-18 21:21:00 +02007339static PyObject *
7340decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007341 const char *s, Py_ssize_t size,
7342 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007343{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007344 wchar_t *buf = NULL;
7345 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007346 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007347
Victor Stinner3a50e702011-10-18 21:21:00 +02007348 if (code_page < 0) {
7349 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7350 return NULL;
7351 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007352 if (size < 0) {
7353 PyErr_BadInternalCall();
7354 return NULL;
7355 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007356
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007357 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007358 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007359
Victor Stinner76a31a62011-11-04 00:05:13 +01007360 do
7361 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007362#ifdef NEED_RETRY
Miss Islington (bot)f93c15a2019-08-21 16:53:56 -07007363 if (size > DECODING_CHUNK_SIZE) {
7364 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007365 final = 0;
7366 done = 0;
7367 }
7368 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007369#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007370 {
7371 chunk_size = (int)size;
7372 final = (consumed == NULL);
7373 done = 1;
7374 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007375
Victor Stinner76a31a62011-11-04 00:05:13 +01007376 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007377 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007378 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007379 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007380 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007381
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007382 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007383 s, chunk_size);
7384 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007385 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007386 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007387 errors, final);
7388 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007389
7390 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007391 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007392 return NULL;
7393 }
7394
7395 if (consumed)
7396 *consumed += converted;
7397
7398 s += converted;
7399 size -= converted;
7400 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007401
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007402 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7403 PyMem_Free(buf);
7404 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007405}
7406
Alexander Belopolsky40018472011-02-26 01:02:56 +00007407PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007408PyUnicode_DecodeCodePageStateful(int code_page,
7409 const char *s,
7410 Py_ssize_t size,
7411 const char *errors,
7412 Py_ssize_t *consumed)
7413{
7414 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7415}
7416
7417PyObject *
7418PyUnicode_DecodeMBCSStateful(const char *s,
7419 Py_ssize_t size,
7420 const char *errors,
7421 Py_ssize_t *consumed)
7422{
7423 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7424}
7425
7426PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007427PyUnicode_DecodeMBCS(const char *s,
7428 Py_ssize_t size,
7429 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007430{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007431 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7432}
7433
Victor Stinner3a50e702011-10-18 21:21:00 +02007434static DWORD
7435encode_code_page_flags(UINT code_page, const char *errors)
7436{
7437 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007438 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007439 }
7440 else if (code_page == CP_UTF7) {
7441 /* CP_UTF7 only supports flags=0 */
7442 return 0;
7443 }
7444 else {
7445 if (errors != NULL && strcmp(errors, "replace") == 0)
7446 return 0;
7447 else
7448 return WC_NO_BEST_FIT_CHARS;
7449 }
7450}
7451
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007452/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007453 * Encode a Unicode string to a Windows code page into a byte string in strict
7454 * mode.
7455 *
7456 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007457 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007458 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007459static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007460encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007461 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007462 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007463{
Victor Stinner554f3f02010-06-16 23:33:54 +00007464 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007465 BOOL *pusedDefaultChar = &usedDefaultChar;
7466 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007467 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007468 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007469 const DWORD flags = encode_code_page_flags(code_page, NULL);
7470 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007471 /* Create a substring so that we can get the UTF-16 representation
7472 of just the slice under consideration. */
7473 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007474
Martin v. Löwis3d325192011-11-04 18:23:06 +01007475 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007476
Victor Stinner3a50e702011-10-18 21:21:00 +02007477 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007478 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007479 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007480 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007481
Victor Stinner2fc507f2011-11-04 20:06:39 +01007482 substring = PyUnicode_Substring(unicode, offset, offset+len);
7483 if (substring == NULL)
7484 return -1;
7485 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7486 if (p == NULL) {
7487 Py_DECREF(substring);
7488 return -1;
7489 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007490 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007491
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007492 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007493 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007494 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007495 NULL, 0,
7496 NULL, pusedDefaultChar);
7497 if (outsize <= 0)
7498 goto error;
7499 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007500 if (pusedDefaultChar && *pusedDefaultChar) {
7501 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007502 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007503 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007504
Victor Stinner3a50e702011-10-18 21:21:00 +02007505 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007506 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007507 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007508 if (*outbytes == NULL) {
7509 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007510 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007511 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007512 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007513 }
7514 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007515 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007516 const Py_ssize_t n = PyBytes_Size(*outbytes);
7517 if (outsize > PY_SSIZE_T_MAX - n) {
7518 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007519 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007520 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007521 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007522 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7523 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007524 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007525 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007526 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007527 }
7528
7529 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007530 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007531 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007532 out, outsize,
7533 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007534 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007535 if (outsize <= 0)
7536 goto error;
7537 if (pusedDefaultChar && *pusedDefaultChar)
7538 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007539 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007540
Victor Stinner3a50e702011-10-18 21:21:00 +02007541error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007542 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007543 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7544 return -2;
7545 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007546 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007547}
7548
Victor Stinner3a50e702011-10-18 21:21:00 +02007549/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007550 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007551 * error handler.
7552 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007553 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007554 * -1 on other error.
7555 */
7556static int
7557encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007558 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007559 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007560{
Victor Stinner3a50e702011-10-18 21:21:00 +02007561 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007562 Py_ssize_t pos = unicode_offset;
7563 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007564 /* Ideally, we should get reason from FormatMessage. This is the Windows
7565 2000 English version of the message. */
7566 const char *reason = "invalid character";
7567 /* 4=maximum length of a UTF-8 sequence */
7568 char buffer[4];
7569 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7570 Py_ssize_t outsize;
7571 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007572 PyObject *errorHandler = NULL;
7573 PyObject *exc = NULL;
7574 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007575 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007576 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007577 PyObject *rep;
7578 int ret = -1;
7579
7580 assert(insize > 0);
7581
7582 encoding = code_page_name(code_page, &encoding_obj);
7583 if (encoding == NULL)
7584 return -1;
7585
7586 if (errors == NULL || strcmp(errors, "strict") == 0) {
7587 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7588 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007589 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007590 if (exc != NULL) {
7591 PyCodec_StrictErrors(exc);
7592 Py_DECREF(exc);
7593 }
7594 Py_XDECREF(encoding_obj);
7595 return -1;
7596 }
7597
7598 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7599 pusedDefaultChar = &usedDefaultChar;
7600 else
7601 pusedDefaultChar = NULL;
7602
7603 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7604 PyErr_NoMemory();
7605 goto error;
7606 }
7607 outsize = insize * Py_ARRAY_LENGTH(buffer);
7608
7609 if (*outbytes == NULL) {
7610 /* Create string object */
7611 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7612 if (*outbytes == NULL)
7613 goto error;
7614 out = PyBytes_AS_STRING(*outbytes);
7615 }
7616 else {
7617 /* Extend string object */
7618 Py_ssize_t n = PyBytes_Size(*outbytes);
7619 if (n > PY_SSIZE_T_MAX - outsize) {
7620 PyErr_NoMemory();
7621 goto error;
7622 }
7623 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7624 goto error;
7625 out = PyBytes_AS_STRING(*outbytes) + n;
7626 }
7627
7628 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007629 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007630 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007631 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7632 wchar_t chars[2];
7633 int charsize;
7634 if (ch < 0x10000) {
7635 chars[0] = (wchar_t)ch;
7636 charsize = 1;
7637 }
7638 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007639 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7640 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641 charsize = 2;
7642 }
7643
Victor Stinner3a50e702011-10-18 21:21:00 +02007644 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007645 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 buffer, Py_ARRAY_LENGTH(buffer),
7647 NULL, pusedDefaultChar);
7648 if (outsize > 0) {
7649 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7650 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007651 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007652 memcpy(out, buffer, outsize);
7653 out += outsize;
7654 continue;
7655 }
7656 }
7657 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7658 PyErr_SetFromWindowsErr(0);
7659 goto error;
7660 }
7661
Victor Stinner3a50e702011-10-18 21:21:00 +02007662 rep = unicode_encode_call_errorhandler(
7663 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007664 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007665 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007666 if (rep == NULL)
7667 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007668 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007669
7670 if (PyBytes_Check(rep)) {
7671 outsize = PyBytes_GET_SIZE(rep);
7672 if (outsize != 1) {
7673 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7674 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7675 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7676 Py_DECREF(rep);
7677 goto error;
7678 }
7679 out = PyBytes_AS_STRING(*outbytes) + offset;
7680 }
7681 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7682 out += outsize;
7683 }
7684 else {
7685 Py_ssize_t i;
7686 enum PyUnicode_Kind kind;
7687 void *data;
7688
Benjamin Petersonbac79492012-01-14 13:34:47 -05007689 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007690 Py_DECREF(rep);
7691 goto error;
7692 }
7693
7694 outsize = PyUnicode_GET_LENGTH(rep);
7695 if (outsize != 1) {
7696 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7697 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7698 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7699 Py_DECREF(rep);
7700 goto error;
7701 }
7702 out = PyBytes_AS_STRING(*outbytes) + offset;
7703 }
7704 kind = PyUnicode_KIND(rep);
7705 data = PyUnicode_DATA(rep);
7706 for (i=0; i < outsize; i++) {
7707 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7708 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007709 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007710 encoding, unicode,
7711 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007712 "unable to encode error handler result to ASCII");
7713 Py_DECREF(rep);
7714 goto error;
7715 }
7716 *out = (unsigned char)ch;
7717 out++;
7718 }
7719 }
7720 Py_DECREF(rep);
7721 }
7722 /* write a NUL byte */
7723 *out = 0;
7724 outsize = out - PyBytes_AS_STRING(*outbytes);
7725 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7726 if (_PyBytes_Resize(outbytes, outsize) < 0)
7727 goto error;
7728 ret = 0;
7729
7730error:
7731 Py_XDECREF(encoding_obj);
7732 Py_XDECREF(errorHandler);
7733 Py_XDECREF(exc);
7734 return ret;
7735}
7736
Victor Stinner3a50e702011-10-18 21:21:00 +02007737static PyObject *
7738encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007739 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007740 const char *errors)
7741{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007742 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007743 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007744 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007745 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007746
Victor Stinner29dacf22015-01-26 16:41:32 +01007747 if (!PyUnicode_Check(unicode)) {
7748 PyErr_BadArgument();
7749 return NULL;
7750 }
7751
Benjamin Petersonbac79492012-01-14 13:34:47 -05007752 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007753 return NULL;
7754 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007755
Victor Stinner3a50e702011-10-18 21:21:00 +02007756 if (code_page < 0) {
7757 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7758 return NULL;
7759 }
7760
Martin v. Löwis3d325192011-11-04 18:23:06 +01007761 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007762 return PyBytes_FromStringAndSize(NULL, 0);
7763
Victor Stinner7581cef2011-11-03 22:32:33 +01007764 offset = 0;
7765 do
7766 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007767#ifdef NEED_RETRY
Miss Islington (bot)f93c15a2019-08-21 16:53:56 -07007768 if (len > DECODING_CHUNK_SIZE) {
7769 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007770 done = 0;
7771 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007772 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007773#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007774 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007775 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007776 done = 1;
7777 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007778
Victor Stinner76a31a62011-11-04 00:05:13 +01007779 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007780 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007781 errors);
7782 if (ret == -2)
7783 ret = encode_code_page_errors(code_page, &outbytes,
7784 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007785 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007786 if (ret < 0) {
7787 Py_XDECREF(outbytes);
7788 return NULL;
7789 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007790
Victor Stinner7581cef2011-11-03 22:32:33 +01007791 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007792 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007793 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007794
Victor Stinner3a50e702011-10-18 21:21:00 +02007795 return outbytes;
7796}
7797
7798PyObject *
7799PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7800 Py_ssize_t size,
7801 const char *errors)
7802{
Victor Stinner7581cef2011-11-03 22:32:33 +01007803 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007804 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007805 if (unicode == NULL)
7806 return NULL;
7807 res = encode_code_page(CP_ACP, unicode, errors);
7808 Py_DECREF(unicode);
7809 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007810}
7811
7812PyObject *
7813PyUnicode_EncodeCodePage(int code_page,
7814 PyObject *unicode,
7815 const char *errors)
7816{
Victor Stinner7581cef2011-11-03 22:32:33 +01007817 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007818}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007819
Alexander Belopolsky40018472011-02-26 01:02:56 +00007820PyObject *
7821PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007822{
Victor Stinner7581cef2011-11-03 22:32:33 +01007823 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007824}
7825
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007826#undef NEED_RETRY
7827
Steve Dowercc16be82016-09-08 10:35:16 -07007828#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007829
Guido van Rossumd57fd912000-03-10 22:53:23 +00007830/* --- Character Mapping Codec -------------------------------------------- */
7831
/* Decode the size bytes at s into writer, using the str object `mapping` as
   a lookup table indexed by byte value.  A byte whose index is outside the
   table, or whose table entry is 0xFFFE, is treated as undefined and passed
   to the error handler named by `errors`.

   Returns 0 on success and -1 on error. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* The mapped character exceeds the writer's current
                   maxchar: prepare the writer for characters up to 0xff,
                   then reload the cached maxchar and data pointer. */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a 2-byte table with at least 256
               entries.  They write straight into the writer buffer and
               jump to Error (with s and x already set) as soon as a
               character needs the general handling below. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* General path: one byte per outer-loop iteration. */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* Also reached by goto from the inner loops above. */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7950
/* Decode the size bytes at s into writer, looking each byte value up in
   `mapping` with PyObject_GetItem().  A lookup result may be:
     - None, the integer 0xFFFE, the one-character string U+FFFE, or a
       missing key (LookupError): the byte is undefined and is passed to the
       error handler named by `errors`;
     - an integer in the range 0..MAX_UNICODE: that code point is written;
     - a str: its contents are written.
   Any other result raises TypeError.

   Returns 0 on success and -1 on error. */
static int
charmap_decode_mapping(const char *s,
                       Py_ssize_t size,
                       PyObject *mapping,
                       const char *errors,
                       _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    unsigned char ch;
    /* item holds the current lookup result.  It is cleared at the end of
       every iteration, so the Py_XDECREF at onError only releases a result
       that is still pending. */
    PyObject *key, *item = NULL;

    e = s + size;

    while (s < e) {
        ch = *s;

        /* Get mapping (char ordinal -> integer, Unicode char or None) */
        key = PyLong_FromLong((long)ch);
        if (key == NULL)
            goto onError;

        item = PyObject_GetItem(mapping, key);
        Py_DECREF(key);
        if (item == NULL) {
            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                /* No mapping found means: mapping is undefined. */
                PyErr_Clear();
                goto Undefined;
            } else
                goto onError;
        }

        /* Apply mapping */
        if (item == Py_None)
            goto Undefined;
        if (PyLong_Check(item)) {
            long value = PyLong_AS_LONG(item);
            if (value == 0xFFFE)
                goto Undefined;
            if (value < 0 || value > MAX_UNICODE) {
                PyErr_Format(PyExc_TypeError,
                             "character mapping must be in range(0x%lx)",
                             (unsigned long)MAX_UNICODE + 1);
                goto onError;
            }

            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                goto onError;
        }
        else if (PyUnicode_Check(item)) {
            if (PyUnicode_READY(item) == -1)
                goto onError;
            if (PyUnicode_GET_LENGTH(item) == 1) {
                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
                if (value == 0xFFFE)
                    goto Undefined;
                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
                    goto onError;
            }
            else {
                /* Length other than 1: turn on the writer's overallocate
                   flag before appending the whole string. */
                writer->overallocate = 1;
                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
                    goto onError;
            }
        }
        else {
            /* wrong return value */
            PyErr_SetString(PyExc_TypeError,
                            "character mapping must return integer, None or str");
            goto onError;
        }
        Py_CLEAR(item);
        ++s;
        continue;

Undefined:
        /* undefined mapping */
        Py_CLEAR(item);
        startinpos = s-starts;
        endinpos = startinpos+1;
        /* s is passed by address together with starts and e;
           NOTE(review): presumably the handler repositions s, since this
           branch never advances it itself -- confirm against the helper. */
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "charmap", "character maps to <undefined>",
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                writer)) {
            goto onError;
        }
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(item);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
8052
Alexander Belopolsky40018472011-02-26 01:02:56 +00008053PyObject *
8054PyUnicode_DecodeCharmap(const char *s,
8055 Py_ssize_t size,
8056 PyObject *mapping,
8057 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008058{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008059 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008060
Guido van Rossumd57fd912000-03-10 22:53:23 +00008061 /* Default to Latin-1 */
8062 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008063 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008064
Guido van Rossumd57fd912000-03-10 22:53:23 +00008065 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008066 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008067 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008068 writer.min_length = size;
8069 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008070 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008071
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008072 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008073 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8074 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008075 }
8076 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008077 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8078 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008079 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008080 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008081
Benjamin Peterson29060642009-01-31 22:14:21 +00008082 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008083 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008084 return NULL;
8085}
8086
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008087/* Charmap encoding: the lookup table */
8088
/* In-memory layout of an EncodingMap object: a three-level lookup table from
   a BMP code point to a byte value.  The object is allocated with extra
   space after the struct; level23 is the start of that variable-size part. */
struct encoding_map {
    PyObject_HEAD
    /* Indexed by ch >> 11.  Holds the number of a level-2 block, or 0xFF
       when no character in that range is mapped. */
    unsigned char level1[32];
    /* Number of 16-byte level-2 blocks and of 128-byte level-3 blocks
       stored in level23. */
    int count2, count3;
    /* 16*count2 bytes of level-2 blocks followed by 128*count3 bytes of
       level-3 blocks.  Declared with one element; size computations
       elsewhere subtract that byte. */
    unsigned char level23[1];
};
8095
8096static PyObject*
8097encoding_map_size(PyObject *obj, PyObject* args)
8098{
8099 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008100 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008101 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008102}
8103
/* Method table of EncodingMapType: a single "size" method that takes no
   arguments. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }  /* end-of-table marker */
};
8109
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008110static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008111 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008112 "EncodingMap", /*tp_name*/
8113 sizeof(struct encoding_map), /*tp_basicsize*/
8114 0, /*tp_itemsize*/
8115 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008116 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008117 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008118 0, /*tp_getattr*/
8119 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008120 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008121 0, /*tp_repr*/
8122 0, /*tp_as_number*/
8123 0, /*tp_as_sequence*/
8124 0, /*tp_as_mapping*/
8125 0, /*tp_hash*/
8126 0, /*tp_call*/
8127 0, /*tp_str*/
8128 0, /*tp_getattro*/
8129 0, /*tp_setattro*/
8130 0, /*tp_as_buffer*/
8131 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8132 0, /*tp_doc*/
8133 0, /*tp_traverse*/
8134 0, /*tp_clear*/
8135 0, /*tp_richcompare*/
8136 0, /*tp_weaklistoffset*/
8137 0, /*tp_iter*/
8138 0, /*tp_iternext*/
8139 encoding_map_methods, /*tp_methods*/
8140 0, /*tp_members*/
8141 0, /*tp_getset*/
8142 0, /*tp_base*/
8143 0, /*tp_dict*/
8144 0, /*tp_descr_get*/
8145 0, /*tp_descr_set*/
8146 0, /*tp_dictoffset*/
8147 0, /*tp_init*/
8148 0, /*tp_alloc*/
8149 0, /*tp_new*/
8150 0, /*tp_free*/
8151 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008152};
8153
8154PyObject*
8155PyUnicode_BuildEncodingMap(PyObject* string)
8156{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008157 PyObject *result;
8158 struct encoding_map *mresult;
8159 int i;
8160 int need_dict = 0;
8161 unsigned char level1[32];
8162 unsigned char level2[512];
8163 unsigned char *mlevel1, *mlevel2, *mlevel3;
8164 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008165 int kind;
8166 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008167 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008168 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008169
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008170 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008171 PyErr_BadArgument();
8172 return NULL;
8173 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008174 kind = PyUnicode_KIND(string);
8175 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008176 length = PyUnicode_GET_LENGTH(string);
8177 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008178 memset(level1, 0xFF, sizeof level1);
8179 memset(level2, 0xFF, sizeof level2);
8180
8181 /* If there isn't a one-to-one mapping of NULL to \0,
8182 or if there are non-BMP characters, we need to use
8183 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008184 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008185 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008186 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008187 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008188 ch = PyUnicode_READ(kind, data, i);
8189 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008190 need_dict = 1;
8191 break;
8192 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008193 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008194 /* unmapped character */
8195 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008196 l1 = ch >> 11;
8197 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008198 if (level1[l1] == 0xFF)
8199 level1[l1] = count2++;
8200 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008201 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008202 }
8203
8204 if (count2 >= 0xFF || count3 >= 0xFF)
8205 need_dict = 1;
8206
8207 if (need_dict) {
8208 PyObject *result = PyDict_New();
8209 PyObject *key, *value;
8210 if (!result)
8211 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008212 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008213 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008214 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008215 if (!key || !value)
8216 goto failed1;
8217 if (PyDict_SetItem(result, key, value) == -1)
8218 goto failed1;
8219 Py_DECREF(key);
8220 Py_DECREF(value);
8221 }
8222 return result;
8223 failed1:
8224 Py_XDECREF(key);
8225 Py_XDECREF(value);
8226 Py_DECREF(result);
8227 return NULL;
8228 }
8229
8230 /* Create a three-level trie */
8231 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8232 16*count2 + 128*count3 - 1);
8233 if (!result)
8234 return PyErr_NoMemory();
8235 PyObject_Init(result, &EncodingMapType);
8236 mresult = (struct encoding_map*)result;
8237 mresult->count2 = count2;
8238 mresult->count3 = count3;
8239 mlevel1 = mresult->level1;
8240 mlevel2 = mresult->level23;
8241 mlevel3 = mresult->level23 + 16*count2;
8242 memcpy(mlevel1, level1, 32);
8243 memset(mlevel2, 0xFF, 16*count2);
8244 memset(mlevel3, 0, 128*count3);
8245 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008246 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008247 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008248 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8249 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008250 /* unmapped character */
8251 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008252 o1 = ch>>11;
8253 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008254 i2 = 16*mlevel1[o1] + o2;
8255 if (mlevel2[i2] == 0xFF)
8256 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008257 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008258 i3 = 128*mlevel2[i2] + o3;
8259 mlevel3[i3] = i;
8260 }
8261 return result;
8262}
8263
8264static int
Victor Stinner22168992011-11-20 17:09:18 +01008265encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008266{
8267 struct encoding_map *map = (struct encoding_map*)mapping;
8268 int l1 = c>>11;
8269 int l2 = (c>>7) & 0xF;
8270 int l3 = c & 0x7F;
8271 int i;
8272
Victor Stinner22168992011-11-20 17:09:18 +01008273 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008274 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008275 if (c == 0)
8276 return 0;
8277 /* level 1*/
8278 i = map->level1[l1];
8279 if (i == 0xFF) {
8280 return -1;
8281 }
8282 /* level 2*/
8283 i = map->level23[16*i+l2];
8284 if (i == 0xFF) {
8285 return -1;
8286 }
8287 /* level 3 */
8288 i = map->level23[16*map->count2 + 128*i + l3];
8289 if (i == 0) {
8290 return -1;
8291 }
8292 return i;
8293}
8294
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008295/* Lookup the character ch in the mapping. If the character
8296 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008297 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008298static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008299charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008300{
Christian Heimes217cfd12007-12-02 14:31:20 +00008301 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008302 PyObject *x;
8303
8304 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008305 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008306 x = PyObject_GetItem(mapping, w);
8307 Py_DECREF(w);
8308 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008309 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8310 /* No mapping found means: mapping is undefined. */
8311 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008312 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008313 } else
8314 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008315 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008316 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008317 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008318 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008319 long value = PyLong_AS_LONG(x);
8320 if (value < 0 || value > 255) {
8321 PyErr_SetString(PyExc_TypeError,
8322 "character mapping must be in range(256)");
8323 Py_DECREF(x);
8324 return NULL;
8325 }
8326 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008327 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008328 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008329 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008330 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008331 /* wrong return value */
8332 PyErr_Format(PyExc_TypeError,
8333 "character mapping must return integer, bytes or None, not %.400s",
8334 x->ob_type->tp_name);
8335 Py_DECREF(x);
8336 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008337 }
8338}
8339
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008340static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008341charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008342{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008343 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8344 /* exponentially overallocate to minimize reallocations */
8345 if (requiredsize < 2*outsize)
8346 requiredsize = 2*outsize;
8347 if (_PyBytes_Resize(outobj, requiredsize))
8348 return -1;
8349 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008350}
8351
/* Outcome of trying to encode a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008355/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008356 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008357 space is available. Return a new reference to the object that
8358 was put in the output buffer, or Py_None, if the mapping was undefined
8359 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008360 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008361static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008362charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008363 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008364{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008365 PyObject *rep;
8366 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008367 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008368
Christian Heimes90aa7642007-12-19 02:45:37 +00008369 if (Py_TYPE(mapping) == &EncodingMapType) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008371 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008372 if (res == -1)
8373 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008374 if (outsize<requiredsize)
8375 if (charmapencode_resize(outobj, outpos, requiredsize))
8376 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008377 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008378 outstart[(*outpos)++] = (char)res;
8379 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008380 }
8381
8382 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008383 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008384 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008385 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008386 Py_DECREF(rep);
8387 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008388 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008389 if (PyLong_Check(rep)) {
8390 Py_ssize_t requiredsize = *outpos+1;
8391 if (outsize<requiredsize)
8392 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8393 Py_DECREF(rep);
8394 return enc_EXCEPTION;
8395 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008396 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008397 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008398 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008399 else {
8400 const char *repchars = PyBytes_AS_STRING(rep);
8401 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8402 Py_ssize_t requiredsize = *outpos+repsize;
8403 if (outsize<requiredsize)
8404 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8405 Py_DECREF(rep);
8406 return enc_EXCEPTION;
8407 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008408 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008409 memcpy(outstart + *outpos, repchars, repsize);
8410 *outpos += repsize;
8411 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008412 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008413 Py_DECREF(rep);
8414 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008415}
8416
8417/* handle an error in PyUnicode_EncodeCharmap
8418 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008419static int
8420charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008421 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008423 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008424 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008425{
8426 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008427 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008428 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008429 enum PyUnicode_Kind kind;
8430 void *data;
8431 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008432 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008433 Py_ssize_t collstartpos = *inpos;
8434 Py_ssize_t collendpos = *inpos+1;
8435 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008436 const char *encoding = "charmap";
8437 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008438 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008439 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008440 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008441
Benjamin Petersonbac79492012-01-14 13:34:47 -05008442 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008443 return -1;
8444 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008445 /* find all unencodable characters */
8446 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008447 PyObject *rep;
Christian Heimes90aa7642007-12-19 02:45:37 +00008448 if (Py_TYPE(mapping) == &EncodingMapType) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008449 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008450 val = encoding_map_lookup(ch, mapping);
8451 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008452 break;
8453 ++collendpos;
8454 continue;
8455 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008456
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008457 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8458 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008459 if (rep==NULL)
8460 return -1;
8461 else if (rep!=Py_None) {
8462 Py_DECREF(rep);
8463 break;
8464 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008465 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008466 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008467 }
8468 /* cache callback name lookup
8469 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008470 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008471 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008472
8473 switch (*error_handler) {
8474 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008475 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008476 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008477
8478 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008479 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008480 x = charmapencode_output('?', mapping, res, respos);
8481 if (x==enc_EXCEPTION) {
8482 return -1;
8483 }
8484 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008485 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008486 return -1;
8487 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008488 }
8489 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008490 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008491 *inpos = collendpos;
8492 break;
Victor Stinner50149202015-09-22 00:26:54 +02008493
8494 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008495 /* generate replacement (temporarily (mis)uses p) */
8496 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008497 char buffer[2+29+1+1];
8498 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008499 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 for (cp = buffer; *cp; ++cp) {
8501 x = charmapencode_output(*cp, mapping, res, respos);
8502 if (x==enc_EXCEPTION)
8503 return -1;
8504 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008505 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008506 return -1;
8507 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008508 }
8509 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008510 *inpos = collendpos;
8511 break;
Victor Stinner50149202015-09-22 00:26:54 +02008512
Benjamin Peterson14339b62009-01-31 16:36:08 +00008513 default:
Victor Stinner50149202015-09-22 00:26:54 +02008514 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008515 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008516 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008517 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008518 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008519 if (PyBytes_Check(repunicode)) {
8520 /* Directly copy bytes result to output. */
8521 Py_ssize_t outsize = PyBytes_Size(*res);
8522 Py_ssize_t requiredsize;
8523 repsize = PyBytes_Size(repunicode);
8524 requiredsize = *respos + repsize;
8525 if (requiredsize > outsize)
8526 /* Make room for all additional bytes. */
8527 if (charmapencode_resize(res, respos, requiredsize)) {
8528 Py_DECREF(repunicode);
8529 return -1;
8530 }
8531 memcpy(PyBytes_AsString(*res) + *respos,
8532 PyBytes_AsString(repunicode), repsize);
8533 *respos += repsize;
8534 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008535 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008536 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008537 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008538 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008539 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008540 Py_DECREF(repunicode);
8541 return -1;
8542 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008543 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008544 data = PyUnicode_DATA(repunicode);
8545 kind = PyUnicode_KIND(repunicode);
8546 for (index = 0; index < repsize; index++) {
8547 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8548 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008549 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008550 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008551 return -1;
8552 }
8553 else if (x==enc_FAILED) {
8554 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008555 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008556 return -1;
8557 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008558 }
8559 *inpos = newpos;
8560 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 }
8562 return 0;
8563}
8564
Alexander Belopolsky40018472011-02-26 01:02:56 +00008565PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008566_PyUnicode_EncodeCharmap(PyObject *unicode,
8567 PyObject *mapping,
8568 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008569{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008570 /* output object */
8571 PyObject *res = NULL;
8572 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008573 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008574 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008575 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008576 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008577 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008578 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008579 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008580 void *data;
8581 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008582
Benjamin Petersonbac79492012-01-14 13:34:47 -05008583 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008584 return NULL;
8585 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008586 data = PyUnicode_DATA(unicode);
8587 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008588
Guido van Rossumd57fd912000-03-10 22:53:23 +00008589 /* Default to Latin-1 */
8590 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008591 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008592
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008593 /* allocate enough for a simple encoding without
8594 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008595 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008596 if (res == NULL)
8597 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008598 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008599 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008600
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008601 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008602 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008603 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008604 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008605 if (x==enc_EXCEPTION) /* error */
8606 goto onError;
8607 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008608 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008609 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008610 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008611 &res, &respos)) {
8612 goto onError;
8613 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008614 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008615 else
8616 /* done with this character => adjust input position */
8617 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008618 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008619
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008620 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008621 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008622 if (_PyBytes_Resize(&res, respos) < 0)
8623 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008624
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008625 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008626 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008627 return res;
8628
Benjamin Peterson29060642009-01-31 22:14:21 +00008629 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008630 Py_XDECREF(res);
8631 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008632 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008633 return NULL;
8634}
8635
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008636/* Deprecated */
8637PyObject *
8638PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8639 Py_ssize_t size,
8640 PyObject *mapping,
8641 const char *errors)
8642{
8643 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008644 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008645 if (unicode == NULL)
8646 return NULL;
8647 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8648 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008649 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008650}
8651
Alexander Belopolsky40018472011-02-26 01:02:56 +00008652PyObject *
8653PyUnicode_AsCharmapString(PyObject *unicode,
8654 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008655{
8656 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008657 PyErr_BadArgument();
8658 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008659 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008660 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008661}
8662
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008663/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008664static void
8665make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008666 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008667 Py_ssize_t startpos, Py_ssize_t endpos,
8668 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008669{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008670 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008671 *exceptionObject = _PyUnicodeTranslateError_Create(
8672 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008673 }
8674 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008675 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8676 goto onError;
8677 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8678 goto onError;
8679 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8680 goto onError;
8681 return;
8682 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008683 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008684 }
8685}
8686
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008687/* error handling callback helper:
8688 build arguments, call the callback and check the arguments,
8689 put the result into newpos and return the replacement string, which
8690 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008691static PyObject *
8692unicode_translate_call_errorhandler(const char *errors,
8693 PyObject **errorHandler,
8694 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008695 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008696 Py_ssize_t startpos, Py_ssize_t endpos,
8697 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008698{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008699 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008700
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008701 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008702 PyObject *restuple;
8703 PyObject *resunicode;
8704
8705 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008706 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008707 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008708 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 }
8710
8711 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008712 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008713 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008714 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008715
Victor Stinnerde4ae3d2016-12-04 22:59:09 +01008716 restuple = PyObject_CallFunctionObjArgs(
8717 *errorHandler, *exceptionObject, NULL);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008718 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008720 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008721 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008722 Py_DECREF(restuple);
8723 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008724 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008725 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008726 &resunicode, &i_newpos)) {
8727 Py_DECREF(restuple);
8728 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008729 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008730 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008731 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008732 else
8733 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008734 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008735 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008736 Py_DECREF(restuple);
8737 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008738 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008739 Py_INCREF(resunicode);
8740 Py_DECREF(restuple);
8741 return resunicode;
8742}
8743
8744/* Lookup the character ch in the mapping and put the result in result,
8745 which must be decrefed by the caller.
8746 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008747static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008748charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008749{
Christian Heimes217cfd12007-12-02 14:31:20 +00008750 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008751 PyObject *x;
8752
8753 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008754 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008755 x = PyObject_GetItem(mapping, w);
8756 Py_DECREF(w);
8757 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008758 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8759 /* No mapping found means: use 1:1 mapping. */
8760 PyErr_Clear();
8761 *result = NULL;
8762 return 0;
8763 } else
8764 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008765 }
8766 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008767 *result = x;
8768 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008769 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008770 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008771 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008772 if (value < 0 || value > MAX_UNICODE) {
8773 PyErr_Format(PyExc_ValueError,
8774 "character mapping must be in range(0x%x)",
8775 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008776 Py_DECREF(x);
8777 return -1;
8778 }
8779 *result = x;
8780 return 0;
8781 }
8782 else if (PyUnicode_Check(x)) {
8783 *result = x;
8784 return 0;
8785 }
8786 else {
8787 /* wrong return value */
8788 PyErr_SetString(PyExc_TypeError,
8789 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008790 Py_DECREF(x);
8791 return -1;
8792 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008793}
Victor Stinner1194ea02014-04-04 19:37:40 +02008794
8795/* lookup the character, write the result into the writer.
8796 Return 1 if the result was written into the writer, return 0 if the mapping
8797 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008798static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008799charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8800 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008801{
Victor Stinner1194ea02014-04-04 19:37:40 +02008802 PyObject *item;
8803
8804 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008805 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008806
8807 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008808 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008809 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008810 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008811 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008812 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008813 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008814
8815 if (item == Py_None) {
8816 Py_DECREF(item);
8817 return 0;
8818 }
8819
8820 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008821 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8822 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8823 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008824 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8825 Py_DECREF(item);
8826 return -1;
8827 }
8828 Py_DECREF(item);
8829 return 1;
8830 }
8831
8832 if (!PyUnicode_Check(item)) {
8833 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008835 }
8836
8837 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8838 Py_DECREF(item);
8839 return -1;
8840 }
8841
8842 Py_DECREF(item);
8843 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844}
8845
Victor Stinner89a76ab2014-04-05 11:44:04 +02008846static int
8847unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8848 Py_UCS1 *translate)
8849{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008850 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008851 int ret = 0;
8852
Victor Stinner89a76ab2014-04-05 11:44:04 +02008853 if (charmaptranslate_lookup(ch, mapping, &item)) {
8854 return -1;
8855 }
8856
8857 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008858 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008859 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008860 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008861 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008862 /* not found => default to 1:1 mapping */
8863 translate[ch] = ch;
8864 return 1;
8865 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008866 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008867 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008868 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8869 used it */
8870 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008871 /* invalid character or character outside ASCII:
8872 skip the fast translate */
8873 goto exit;
8874 }
8875 translate[ch] = (Py_UCS1)replace;
8876 }
8877 else if (PyUnicode_Check(item)) {
8878 Py_UCS4 replace;
8879
8880 if (PyUnicode_READY(item) == -1) {
8881 Py_DECREF(item);
8882 return -1;
8883 }
8884 if (PyUnicode_GET_LENGTH(item) != 1)
8885 goto exit;
8886
8887 replace = PyUnicode_READ_CHAR(item, 0);
8888 if (replace > 127)
8889 goto exit;
8890 translate[ch] = (Py_UCS1)replace;
8891 }
8892 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008893 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02008894 goto exit;
8895 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02008896 ret = 1;
8897
Benjamin Peterson1365de72014-04-07 20:15:41 -04008898 exit:
8899 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008900 return ret;
8901}
8902
8903/* Fast path for ascii => ascii translation. Return 1 if the whole string
8904 was translated into writer, return 0 if the input string was partially
8905 translated into writer, raise an exception and return -1 on error. */
8906static int
8907unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008908 _PyUnicodeWriter *writer, int ignore,
8909 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008910{
Victor Stinner872b2912014-04-05 14:27:07 +02008911 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008912 Py_ssize_t len;
8913 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02008914 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008915
Victor Stinner89a76ab2014-04-05 11:44:04 +02008916 len = PyUnicode_GET_LENGTH(input);
8917
Victor Stinner872b2912014-04-05 14:27:07 +02008918 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008919
8920 in = PyUnicode_1BYTE_DATA(input);
8921 end = in + len;
8922
8923 assert(PyUnicode_IS_ASCII(writer->buffer));
8924 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8925 out = PyUnicode_1BYTE_DATA(writer->buffer);
8926
Victor Stinner872b2912014-04-05 14:27:07 +02008927 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008928 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02008929 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008930 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02008931 int translate = unicode_fast_translate_lookup(mapping, ch,
8932 ascii_table);
8933 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008934 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02008935 if (translate == 0)
8936 goto exit;
8937 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02008938 }
Victor Stinner872b2912014-04-05 14:27:07 +02008939 if (ch2 == 0xfe) {
8940 if (ignore)
8941 continue;
8942 goto exit;
8943 }
8944 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02008945 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02008946 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008947 }
Victor Stinner872b2912014-04-05 14:27:07 +02008948 res = 1;
8949
8950exit:
8951 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01008952 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02008953 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008954}
8955
Victor Stinner3222da22015-10-01 22:07:32 +02008956static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008957_PyUnicode_TranslateCharmap(PyObject *input,
8958 PyObject *mapping,
8959 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008960{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008961 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02008962 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008963 Py_ssize_t size, i;
8964 int kind;
8965 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02008966 _PyUnicodeWriter writer;
8967 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008968 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008969 PyObject *errorHandler = NULL;
8970 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008971 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008972 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008973
Guido van Rossumd57fd912000-03-10 22:53:23 +00008974 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008975 PyErr_BadArgument();
8976 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008977 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008978
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008979 if (PyUnicode_READY(input) == -1)
8980 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02008981 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008982 kind = PyUnicode_KIND(input);
8983 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008984
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03008985 if (size == 0)
8986 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008987
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008988 /* allocate enough for a simple 1:1 translation without
8989 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02008990 _PyUnicodeWriter_Init(&writer);
8991 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008992 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008993
Victor Stinner872b2912014-04-05 14:27:07 +02008994 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8995
Victor Stinner33798672016-03-01 21:59:58 +01008996 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02008997 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01008998 if (PyUnicode_IS_ASCII(input)) {
8999 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9000 if (res < 0) {
9001 _PyUnicodeWriter_Dealloc(&writer);
9002 return NULL;
9003 }
9004 if (res == 1)
9005 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009006 }
Victor Stinner33798672016-03-01 21:59:58 +01009007 else {
9008 i = 0;
9009 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009010
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009011 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009012 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009013 int translate;
9014 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9015 Py_ssize_t newpos;
9016 /* startpos for collecting untranslatable chars */
9017 Py_ssize_t collstart;
9018 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009019 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009020
Victor Stinner1194ea02014-04-04 19:37:40 +02009021 ch = PyUnicode_READ(kind, data, i);
9022 translate = charmaptranslate_output(ch, mapping, &writer);
9023 if (translate < 0)
9024 goto onError;
9025
9026 if (translate != 0) {
9027 /* it worked => adjust input pointer */
9028 ++i;
9029 continue;
9030 }
9031
9032 /* untranslatable character */
9033 collstart = i;
9034 collend = i+1;
9035
9036 /* find all untranslatable characters */
9037 while (collend < size) {
9038 PyObject *x;
9039 ch = PyUnicode_READ(kind, data, collend);
9040 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009041 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009042 Py_XDECREF(x);
9043 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009044 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009045 ++collend;
9046 }
9047
9048 if (ignore) {
9049 i = collend;
9050 }
9051 else {
9052 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9053 reason, input, &exc,
9054 collstart, collend, &newpos);
9055 if (repunicode == NULL)
9056 goto onError;
9057 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009058 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009059 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009060 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009061 Py_DECREF(repunicode);
9062 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009063 }
9064 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009065 Py_XDECREF(exc);
9066 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009067 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009068
Benjamin Peterson29060642009-01-31 22:14:21 +00009069 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009070 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009071 Py_XDECREF(exc);
9072 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009073 return NULL;
9074}
9075
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076/* Deprecated. Use PyUnicode_Translate instead. */
9077PyObject *
9078PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9079 Py_ssize_t size,
9080 PyObject *mapping,
9081 const char *errors)
9082{
Christian Heimes5f520f42012-09-11 14:03:25 +02009083 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009084 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009085 if (!unicode)
9086 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009087 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9088 Py_DECREF(unicode);
9089 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009090}
9091
Alexander Belopolsky40018472011-02-26 01:02:56 +00009092PyObject *
9093PyUnicode_Translate(PyObject *str,
9094 PyObject *mapping,
9095 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009096{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009097 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009098 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009099 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009100}
Tim Petersced69f82003-09-16 20:30:58 +00009101
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102PyObject *
9103_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9104{
9105 if (!PyUnicode_Check(unicode)) {
9106 PyErr_BadInternalCall();
9107 return NULL;
9108 }
9109 if (PyUnicode_READY(unicode) == -1)
9110 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009111 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009112 /* If the string is already ASCII, just return the same string */
9113 Py_INCREF(unicode);
9114 return unicode;
9115 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009116
9117 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9118 PyObject *result = PyUnicode_New(len, 127);
9119 if (result == NULL) {
9120 return NULL;
9121 }
9122
9123 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9124 int kind = PyUnicode_KIND(unicode);
9125 const void *data = PyUnicode_DATA(unicode);
9126 Py_ssize_t i;
9127 for (i = 0; i < len; ++i) {
9128 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9129 if (ch < 127) {
9130 out[i] = ch;
9131 }
9132 else if (Py_UNICODE_ISSPACE(ch)) {
9133 out[i] = ' ';
9134 }
9135 else {
9136 int decimal = Py_UNICODE_TODECIMAL(ch);
9137 if (decimal < 0) {
9138 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009139 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009140 _PyUnicode_LENGTH(result) = i + 1;
9141 break;
9142 }
9143 out[i] = '0' + decimal;
9144 }
9145 }
9146
INADA Naoki16dfca42018-07-14 12:06:43 +09009147 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009148 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009149}
9150
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009151PyObject *
9152PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9153 Py_ssize_t length)
9154{
Victor Stinnerf0124502011-11-21 23:12:56 +01009155 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009156 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009157 Py_UCS4 maxchar;
9158 enum PyUnicode_Kind kind;
9159 void *data;
9160
Victor Stinner99d7ad02012-02-22 13:37:39 +01009161 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009162 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009163 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009164 if (ch > 127) {
9165 int decimal = Py_UNICODE_TODECIMAL(ch);
9166 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009167 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009168 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009169 }
9170 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009171
9172 /* Copy to a new string */
9173 decimal = PyUnicode_New(length, maxchar);
9174 if (decimal == NULL)
9175 return decimal;
9176 kind = PyUnicode_KIND(decimal);
9177 data = PyUnicode_DATA(decimal);
9178 /* Iterate over code points */
9179 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009180 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009181 if (ch > 127) {
9182 int decimal = Py_UNICODE_TODECIMAL(ch);
9183 if (decimal >= 0)
9184 ch = '0' + decimal;
9185 }
9186 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009187 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009188 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009189}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009190/* --- Decimal Encoder ---------------------------------------------------- */
9191
Alexander Belopolsky40018472011-02-26 01:02:56 +00009192int
9193PyUnicode_EncodeDecimal(Py_UNICODE *s,
9194 Py_ssize_t length,
9195 char *output,
9196 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009197{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009198 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009199 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009200 enum PyUnicode_Kind kind;
9201 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009202
9203 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009204 PyErr_BadArgument();
9205 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009206 }
9207
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009208 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009209 if (unicode == NULL)
9210 return -1;
9211
Victor Stinner42bf7752011-11-21 22:52:58 +01009212 kind = PyUnicode_KIND(unicode);
9213 data = PyUnicode_DATA(unicode);
9214
Victor Stinnerb84d7232011-11-22 01:50:07 +01009215 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009216 PyObject *exc;
9217 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009218 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009219 Py_ssize_t startpos;
9220
9221 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009222
Benjamin Peterson29060642009-01-31 22:14:21 +00009223 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009224 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009225 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009226 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009227 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009228 decimal = Py_UNICODE_TODECIMAL(ch);
9229 if (decimal >= 0) {
9230 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009231 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009232 continue;
9233 }
9234 if (0 < ch && ch < 256) {
9235 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009236 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009237 continue;
9238 }
Victor Stinner6345be92011-11-25 20:09:01 +01009239
Victor Stinner42bf7752011-11-21 22:52:58 +01009240 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009241 exc = NULL;
9242 raise_encode_exception(&exc, "decimal", unicode,
9243 startpos, startpos+1,
9244 "invalid decimal Unicode string");
9245 Py_XDECREF(exc);
9246 Py_DECREF(unicode);
9247 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009248 }
9249 /* 0-terminate the output string */
9250 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009251 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009252 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009253}
9254
Guido van Rossumd57fd912000-03-10 22:53:23 +00009255/* --- Helpers ------------------------------------------------------------ */
9256
/* helper macro to fixup start/end slice values

   Clamps `end` to `len`, and turns negative `start`/`end` values into
   offsets from the end of the sequence (never going below 0).  `start` and
   `end` must be modifiable lvalues; `len` is evaluated more than once and
   must therefore be free of side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves like
   a single statement (e.g. inside an unbraced if/else), and every argument is
   parenthesized so that arbitrary expressions can be passed. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len)) {                    \
            (end) = (len);                      \
        }                                       \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0) {                    \
                (end) = 0;                      \
            }                                   \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0) {                  \
                (start) = 0;                    \
            }                                   \
        }                                       \
    } while (0)
9271
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009272static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009273any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009274 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009275 Py_ssize_t end,
9276 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009277{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009278 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009279 void *buf1, *buf2;
9280 Py_ssize_t len1, len2, result;
9281
9282 kind1 = PyUnicode_KIND(s1);
9283 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009284 if (kind1 < kind2)
9285 return -1;
9286
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009287 len1 = PyUnicode_GET_LENGTH(s1);
9288 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009289 ADJUST_INDICES(start, end, len1);
9290 if (end - start < len2)
9291 return -1;
9292
9293 buf1 = PyUnicode_DATA(s1);
9294 buf2 = PyUnicode_DATA(s2);
9295 if (len2 == 1) {
9296 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9297 result = findchar((const char *)buf1 + kind1*start,
9298 kind1, end - start, ch, direction);
9299 if (result == -1)
9300 return -1;
9301 else
9302 return start + result;
9303 }
9304
9305 if (kind2 != kind1) {
9306 buf2 = _PyUnicode_AsKind(s2, kind1);
9307 if (!buf2)
9308 return -2;
9309 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009310
Victor Stinner794d5672011-10-10 03:21:36 +02009311 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009312 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009313 case PyUnicode_1BYTE_KIND:
9314 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9315 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9316 else
9317 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9318 break;
9319 case PyUnicode_2BYTE_KIND:
9320 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9321 break;
9322 case PyUnicode_4BYTE_KIND:
9323 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9324 break;
9325 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009326 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009327 }
9328 }
9329 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009330 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009331 case PyUnicode_1BYTE_KIND:
9332 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9333 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9334 else
9335 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9336 break;
9337 case PyUnicode_2BYTE_KIND:
9338 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9339 break;
9340 case PyUnicode_4BYTE_KIND:
9341 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9342 break;
9343 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009344 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009345 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009346 }
9347
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009348 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009349 PyMem_Free(buf2);
9350
9351 return result;
9352}
9353
Victor Stinner59423e32018-11-26 13:40:01 +01009354/* _PyUnicode_InsertThousandsGrouping() helper functions */
9355#include "stringlib/localeutil.h"
9356
9357/**
9358 * InsertThousandsGrouping:
9359 * @writer: Unicode writer.
9360 * @n_buffer: Number of characters in @buffer.
9361 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9362 * @d_pos: Start of digits string.
9363 * @n_digits: The number of digits in the string, in which we want
9364 * to put the grouping chars.
9365 * @min_width: The minimum width of the digits in the output string.
9366 * Output will be zero-padded on the left to fill.
9367 * @grouping: see definition in localeconv().
9368 * @thousands_sep: see definition in localeconv().
9369 *
9370 * There are 2 modes: counting and filling. If @writer is NULL,
9371 * we are in counting mode, else filling mode.
9372 * If counting, the required buffer size is returned.
9373 * If filling, we know the buffer will be large enough, so we don't
9374 * need to pass in the buffer size.
9375 * Inserts thousand grouping characters (as defined by grouping and
9376 * thousands_sep) into @writer.
9377 *
9378 * Return value: -1 on error, number of characters otherwise.
9379 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009380Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009381_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009382 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009383 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009384 PyObject *digits,
9385 Py_ssize_t d_pos,
9386 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009387 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009388 const char *grouping,
9389 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009390 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009391{
Xtreak3f7983a2019-01-07 20:39:14 +05309392 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009393 if (writer) {
9394 assert(digits != NULL);
9395 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009396 }
9397 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009398 assert(digits == NULL);
9399 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009400 }
Victor Stinner59423e32018-11-26 13:40:01 +01009401 assert(0 <= d_pos);
9402 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009403 assert(grouping != NULL);
9404
9405 if (digits != NULL) {
9406 if (PyUnicode_READY(digits) == -1) {
9407 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009408 }
Victor Stinner59423e32018-11-26 13:40:01 +01009409 }
9410 if (PyUnicode_READY(thousands_sep) == -1) {
9411 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009412 }
9413
Victor Stinner59423e32018-11-26 13:40:01 +01009414 Py_ssize_t count = 0;
9415 Py_ssize_t n_zeros;
9416 int loop_broken = 0;
9417 int use_separator = 0; /* First time through, don't append the
9418 separator. They only go between
9419 groups. */
9420 Py_ssize_t buffer_pos;
9421 Py_ssize_t digits_pos;
9422 Py_ssize_t len;
9423 Py_ssize_t n_chars;
9424 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9425 be looked at */
9426 /* A generator that returns all of the grouping widths, until it
9427 returns 0. */
9428 GroupGenerator groupgen;
9429 GroupGenerator_init(&groupgen, grouping);
9430 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9431
9432 /* if digits are not grouped, thousands separator
9433 should be an empty string */
9434 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9435
9436 digits_pos = d_pos + n_digits;
9437 if (writer) {
9438 buffer_pos = writer->pos + n_buffer;
9439 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9440 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009441 }
Victor Stinner59423e32018-11-26 13:40:01 +01009442 else {
9443 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009444 }
Victor Stinner59423e32018-11-26 13:40:01 +01009445
9446 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009447 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009448 }
Victor Stinner59423e32018-11-26 13:40:01 +01009449
9450 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9451 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9452 n_zeros = Py_MAX(0, len - remaining);
9453 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9454
9455 /* Use n_zero zero's and n_chars chars */
9456
9457 /* Count only, don't do anything. */
9458 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9459
9460 /* Copy into the writer. */
9461 InsertThousandsGrouping_fill(writer, &buffer_pos,
9462 digits, &digits_pos,
9463 n_chars, n_zeros,
9464 use_separator ? thousands_sep : NULL,
9465 thousands_sep_len, maxchar);
9466
9467 /* Use a separator next time. */
9468 use_separator = 1;
9469
9470 remaining -= n_chars;
9471 min_width -= len;
9472
9473 if (remaining <= 0 && min_width <= 0) {
9474 loop_broken = 1;
9475 break;
9476 }
9477 min_width -= thousands_sep_len;
9478 }
9479 if (!loop_broken) {
9480 /* We left the loop without using a break statement. */
9481
9482 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9483 n_zeros = Py_MAX(0, len - remaining);
9484 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9485
9486 /* Use n_zero zero's and n_chars chars */
9487 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9488
9489 /* Copy into the writer. */
9490 InsertThousandsGrouping_fill(writer, &buffer_pos,
9491 digits, &digits_pos,
9492 n_chars, n_zeros,
9493 use_separator ? thousands_sep : NULL,
9494 thousands_sep_len, maxchar);
9495 }
9496 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009497}
9498
9499
Alexander Belopolsky40018472011-02-26 01:02:56 +00009500Py_ssize_t
9501PyUnicode_Count(PyObject *str,
9502 PyObject *substr,
9503 Py_ssize_t start,
9504 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009505{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009506 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009507 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009508 void *buf1 = NULL, *buf2 = NULL;
9509 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009510
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009511 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009512 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009513
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009514 kind1 = PyUnicode_KIND(str);
9515 kind2 = PyUnicode_KIND(substr);
9516 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009517 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009518
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009519 len1 = PyUnicode_GET_LENGTH(str);
9520 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009521 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009522 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009523 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009524
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009525 buf1 = PyUnicode_DATA(str);
9526 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009527 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009528 buf2 = _PyUnicode_AsKind(substr, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009529 if (!buf2)
9530 goto onError;
9531 }
9532
9533 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009534 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009535 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009536 result = asciilib_count(
9537 ((Py_UCS1*)buf1) + start, end - start,
9538 buf2, len2, PY_SSIZE_T_MAX
9539 );
9540 else
9541 result = ucs1lib_count(
9542 ((Py_UCS1*)buf1) + start, end - start,
9543 buf2, len2, PY_SSIZE_T_MAX
9544 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009545 break;
9546 case PyUnicode_2BYTE_KIND:
9547 result = ucs2lib_count(
9548 ((Py_UCS2*)buf1) + start, end - start,
9549 buf2, len2, PY_SSIZE_T_MAX
9550 );
9551 break;
9552 case PyUnicode_4BYTE_KIND:
9553 result = ucs4lib_count(
9554 ((Py_UCS4*)buf1) + start, end - start,
9555 buf2, len2, PY_SSIZE_T_MAX
9556 );
9557 break;
9558 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009559 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009560 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009561
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009562 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009563 PyMem_Free(buf2);
9564
Guido van Rossumd57fd912000-03-10 22:53:23 +00009565 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009566 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009567 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009568 PyMem_Free(buf2);
9569 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009570}
9571
Alexander Belopolsky40018472011-02-26 01:02:56 +00009572Py_ssize_t
9573PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009574 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009575 Py_ssize_t start,
9576 Py_ssize_t end,
9577 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009578{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009579 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009580 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009581
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009582 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009583}
9584
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009585Py_ssize_t
9586PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9587 Py_ssize_t start, Py_ssize_t end,
9588 int direction)
9589{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009590 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009591 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009592 if (PyUnicode_READY(str) == -1)
9593 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009594 len = PyUnicode_GET_LENGTH(str);
9595 ADJUST_INDICES(start, end, len);
9596 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009597 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009598 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009599 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9600 kind, end-start, ch, direction);
9601 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009602 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009603 else
9604 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009605}
9606
Alexander Belopolsky40018472011-02-26 01:02:56 +00009607static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009608tailmatch(PyObject *self,
9609 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009610 Py_ssize_t start,
9611 Py_ssize_t end,
9612 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009613{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009614 int kind_self;
9615 int kind_sub;
9616 void *data_self;
9617 void *data_sub;
9618 Py_ssize_t offset;
9619 Py_ssize_t i;
9620 Py_ssize_t end_sub;
9621
9622 if (PyUnicode_READY(self) == -1 ||
9623 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009624 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009625
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009626 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9627 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009628 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009629 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009630
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009631 if (PyUnicode_GET_LENGTH(substring) == 0)
9632 return 1;
9633
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009634 kind_self = PyUnicode_KIND(self);
9635 data_self = PyUnicode_DATA(self);
9636 kind_sub = PyUnicode_KIND(substring);
9637 data_sub = PyUnicode_DATA(substring);
9638 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9639
9640 if (direction > 0)
9641 offset = end;
9642 else
9643 offset = start;
9644
9645 if (PyUnicode_READ(kind_self, data_self, offset) ==
9646 PyUnicode_READ(kind_sub, data_sub, 0) &&
9647 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9648 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9649 /* If both are of the same kind, memcmp is sufficient */
9650 if (kind_self == kind_sub) {
9651 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009652 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009653 data_sub,
9654 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009655 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009656 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009657 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009658 else {
9659 /* We do not need to compare 0 and len(substring)-1 because
9660 the if statement above ensured already that they are equal
9661 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009662 for (i = 1; i < end_sub; ++i) {
9663 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9664 PyUnicode_READ(kind_sub, data_sub, i))
9665 return 0;
9666 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009667 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009668 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009669 }
9670
9671 return 0;
9672}
9673
Alexander Belopolsky40018472011-02-26 01:02:56 +00009674Py_ssize_t
9675PyUnicode_Tailmatch(PyObject *str,
9676 PyObject *substr,
9677 Py_ssize_t start,
9678 Py_ssize_t end,
9679 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009681 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009682 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009683
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009684 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685}
9686
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009687static PyObject *
9688ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009689{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009690 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9691 char *resdata, *data = PyUnicode_DATA(self);
9692 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009693
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009694 res = PyUnicode_New(len, 127);
9695 if (res == NULL)
9696 return NULL;
9697 resdata = PyUnicode_DATA(res);
9698 if (lower)
9699 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009701 _Py_bytes_upper(resdata, data, len);
9702 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009703}
9704
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009706handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009707{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009708 Py_ssize_t j;
9709 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009710 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009711 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009712
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009713 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9714
9715 where ! is a negation and \p{xxx} is a character with property xxx.
9716 */
9717 for (j = i - 1; j >= 0; j--) {
9718 c = PyUnicode_READ(kind, data, j);
9719 if (!_PyUnicode_IsCaseIgnorable(c))
9720 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009721 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009722 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9723 if (final_sigma) {
9724 for (j = i + 1; j < length; j++) {
9725 c = PyUnicode_READ(kind, data, j);
9726 if (!_PyUnicode_IsCaseIgnorable(c))
9727 break;
9728 }
9729 final_sigma = j == length || !_PyUnicode_IsCased(c);
9730 }
9731 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009732}
9733
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009734static int
9735lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9736 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009737{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009738 /* Obscure special case. */
9739 if (c == 0x3A3) {
9740 mapped[0] = handle_capital_sigma(kind, data, length, i);
9741 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009742 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009743 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009744}
9745
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009746static Py_ssize_t
9747do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009748{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009749 Py_ssize_t i, k = 0;
9750 int n_res, j;
9751 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009752
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009753 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009754 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009755 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009756 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009757 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009758 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009759 for (i = 1; i < length; i++) {
9760 c = PyUnicode_READ(kind, data, i);
9761 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9762 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009763 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009764 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009765 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009766 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009767 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009768}
9769
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009770static Py_ssize_t
9771do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9772 Py_ssize_t i, k = 0;
9773
9774 for (i = 0; i < length; i++) {
9775 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9776 int n_res, j;
9777 if (Py_UNICODE_ISUPPER(c)) {
9778 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9779 }
9780 else if (Py_UNICODE_ISLOWER(c)) {
9781 n_res = _PyUnicode_ToUpperFull(c, mapped);
9782 }
9783 else {
9784 n_res = 1;
9785 mapped[0] = c;
9786 }
9787 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009788 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009789 res[k++] = mapped[j];
9790 }
9791 }
9792 return k;
9793}
9794
9795static Py_ssize_t
9796do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9797 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009798{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009799 Py_ssize_t i, k = 0;
9800
9801 for (i = 0; i < length; i++) {
9802 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9803 int n_res, j;
9804 if (lower)
9805 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9806 else
9807 n_res = _PyUnicode_ToUpperFull(c, mapped);
9808 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009809 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009810 res[k++] = mapped[j];
9811 }
9812 }
9813 return k;
9814}
9815
9816static Py_ssize_t
9817do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9818{
9819 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9820}
9821
9822static Py_ssize_t
9823do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9824{
9825 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9826}
9827
Benjamin Petersone51757f2012-01-12 21:10:29 -05009828static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009829do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9830{
9831 Py_ssize_t i, k = 0;
9832
9833 for (i = 0; i < length; i++) {
9834 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9835 Py_UCS4 mapped[3];
9836 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9837 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009838 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009839 res[k++] = mapped[j];
9840 }
9841 }
9842 return k;
9843}
9844
9845static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009846do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9847{
9848 Py_ssize_t i, k = 0;
9849 int previous_is_cased;
9850
9851 previous_is_cased = 0;
9852 for (i = 0; i < length; i++) {
9853 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9854 Py_UCS4 mapped[3];
9855 int n_res, j;
9856
9857 if (previous_is_cased)
9858 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9859 else
9860 n_res = _PyUnicode_ToTitleFull(c, mapped);
9861
9862 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009863 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009864 res[k++] = mapped[j];
9865 }
9866
9867 previous_is_cased = _PyUnicode_IsCased(c);
9868 }
9869 return k;
9870}
9871
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872static PyObject *
9873case_operation(PyObject *self,
9874 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9875{
9876 PyObject *res = NULL;
9877 Py_ssize_t length, newlength = 0;
9878 int kind, outkind;
9879 void *data, *outdata;
9880 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9881
Benjamin Petersoneea48462012-01-16 14:28:50 -05009882 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009883
9884 kind = PyUnicode_KIND(self);
9885 data = PyUnicode_DATA(self);
9886 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +02009887 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -04009888 PyErr_SetString(PyExc_OverflowError, "string is too long");
9889 return NULL;
9890 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -04009891 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009892 if (tmp == NULL)
9893 return PyErr_NoMemory();
9894 newlength = perform(kind, data, length, tmp, &maxchar);
9895 res = PyUnicode_New(newlength, maxchar);
9896 if (res == NULL)
9897 goto leave;
9898 tmpend = tmp + newlength;
9899 outdata = PyUnicode_DATA(res);
9900 outkind = PyUnicode_KIND(res);
9901 switch (outkind) {
9902 case PyUnicode_1BYTE_KIND:
9903 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9904 break;
9905 case PyUnicode_2BYTE_KIND:
9906 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9907 break;
9908 case PyUnicode_4BYTE_KIND:
9909 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9910 break;
9911 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009912 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009913 }
9914 leave:
9915 PyMem_FREE(tmp);
9916 return res;
9917}
9918
Tim Peters8ce9f162004-08-27 01:49:32 +00009919PyObject *
9920PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009921{
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009922 PyObject *res;
9923 PyObject *fseq;
9924 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009925 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009926
Benjamin Peterson9743b2c2014-02-15 13:02:52 -05009927 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +00009928 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009929 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +00009930 }
9931
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009932 /* NOTE: the following code can't call back into Python code,
9933 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +00009934 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009935
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009936 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +00009937 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009938 res = _PyUnicode_JoinArray(separator, items, seqlen);
9939 Py_DECREF(fseq);
9940 return res;
9941}
9942
9943PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +02009944_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +03009945{
9946 PyObject *res = NULL; /* the result */
9947 PyObject *sep = NULL;
9948 Py_ssize_t seplen;
9949 PyObject *item;
9950 Py_ssize_t sz, i, res_offset;
9951 Py_UCS4 maxchar;
9952 Py_UCS4 item_maxchar;
9953 int use_memcpy;
9954 unsigned char *res_data = NULL, *sep_data = NULL;
9955 PyObject *last_obj;
9956 unsigned int kind = 0;
9957
Tim Peters05eba1f2004-08-27 21:32:02 +00009958 /* If empty sequence, return u"". */
9959 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02009960 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +00009961 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02009962
Tim Peters05eba1f2004-08-27 21:32:02 +00009963 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +02009964 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +02009965 if (seqlen == 1) {
9966 if (PyUnicode_CheckExact(items[0])) {
9967 res = items[0];
9968 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +02009969 return res;
9970 }
Victor Stinnerdd077322011-10-07 17:02:31 +02009971 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +02009972 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +00009973 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +00009974 else {
Victor Stinneracf47b82011-10-06 12:32:37 +02009975 /* Set up sep and seplen */
9976 if (separator == NULL) {
9977 /* fall back to a blank space separator */
9978 sep = PyUnicode_FromOrdinal(' ');
9979 if (!sep)
9980 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +02009981 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +02009982 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +00009983 }
Victor Stinneracf47b82011-10-06 12:32:37 +02009984 else {
9985 if (!PyUnicode_Check(separator)) {
9986 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02009987 "separator: expected str instance,"
9988 " %.80s found",
9989 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +02009990 goto onError;
9991 }
9992 if (PyUnicode_READY(separator))
9993 goto onError;
9994 sep = separator;
9995 seplen = PyUnicode_GET_LENGTH(separator);
9996 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9997 /* inc refcount to keep this code path symmetric with the
9998 above case of a blank separator */
9999 Py_INCREF(sep);
10000 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010001 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010002 }
10003
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010004 /* There are at least two things to join, or else we have a subclass
10005 * of str in the sequence.
10006 * Do a pre-pass to figure out the total amount of space we'll
10007 * need (sz), and see whether all argument are strings.
10008 */
10009 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010010#ifdef Py_DEBUG
10011 use_memcpy = 0;
10012#else
10013 use_memcpy = 1;
10014#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010015 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010016 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010017 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010018 if (!PyUnicode_Check(item)) {
10019 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010020 "sequence item %zd: expected str instance,"
10021 " %.80s found",
10022 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010023 goto onError;
10024 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010025 if (PyUnicode_READY(item) == -1)
10026 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010027 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010028 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010029 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010030 if (i != 0) {
10031 add_sz += seplen;
10032 }
10033 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010034 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010035 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010036 goto onError;
10037 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010038 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010039 if (use_memcpy && last_obj != NULL) {
10040 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10041 use_memcpy = 0;
10042 }
10043 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010044 }
Tim Petersced69f82003-09-16 20:30:58 +000010045
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010046 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010047 if (res == NULL)
10048 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010049
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010050 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010051#ifdef Py_DEBUG
10052 use_memcpy = 0;
10053#else
10054 if (use_memcpy) {
10055 res_data = PyUnicode_1BYTE_DATA(res);
10056 kind = PyUnicode_KIND(res);
10057 if (seplen != 0)
10058 sep_data = PyUnicode_1BYTE_DATA(sep);
10059 }
10060#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010061 if (use_memcpy) {
10062 for (i = 0; i < seqlen; ++i) {
10063 Py_ssize_t itemlen;
10064 item = items[i];
10065
10066 /* Copy item, and maybe the separator. */
10067 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010068 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010069 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010070 kind * seplen);
10071 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010072 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010073
10074 itemlen = PyUnicode_GET_LENGTH(item);
10075 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010076 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010077 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010078 kind * itemlen);
10079 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010080 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010081 }
10082 assert(res_data == PyUnicode_1BYTE_DATA(res)
10083 + kind * PyUnicode_GET_LENGTH(res));
10084 }
10085 else {
10086 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10087 Py_ssize_t itemlen;
10088 item = items[i];
10089
10090 /* Copy item, and maybe the separator. */
10091 if (i && seplen != 0) {
10092 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10093 res_offset += seplen;
10094 }
10095
10096 itemlen = PyUnicode_GET_LENGTH(item);
10097 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010098 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010099 res_offset += itemlen;
10100 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010101 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010102 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010103 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010104
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010105 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010106 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010107 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010108
Benjamin Peterson29060642009-01-31 22:14:21 +000010109 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010110 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010111 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010112 return NULL;
10113}
10114
Victor Stinnerd3f08822012-05-29 12:57:52 +020010115void
10116_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10117 Py_UCS4 fill_char)
10118{
10119 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010120 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010121 assert(PyUnicode_IS_READY(unicode));
10122 assert(unicode_modifiable(unicode));
10123 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10124 assert(start >= 0);
10125 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010126 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010127}
10128
Victor Stinner3fe55312012-01-04 00:33:50 +010010129Py_ssize_t
10130PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10131 Py_UCS4 fill_char)
10132{
10133 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010134
10135 if (!PyUnicode_Check(unicode)) {
10136 PyErr_BadInternalCall();
10137 return -1;
10138 }
10139 if (PyUnicode_READY(unicode) == -1)
10140 return -1;
10141 if (unicode_check_modifiable(unicode))
10142 return -1;
10143
Victor Stinnerd3f08822012-05-29 12:57:52 +020010144 if (start < 0) {
10145 PyErr_SetString(PyExc_IndexError, "string index out of range");
10146 return -1;
10147 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010148 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10149 PyErr_SetString(PyExc_ValueError,
10150 "fill character is bigger than "
10151 "the string maximum character");
10152 return -1;
10153 }
10154
10155 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10156 length = Py_MIN(maxlen, length);
10157 if (length <= 0)
10158 return 0;
10159
Victor Stinnerd3f08822012-05-29 12:57:52 +020010160 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010161 return length;
10162}
10163
Victor Stinner9310abb2011-10-05 00:59:23 +020010164static PyObject *
10165pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010166 Py_ssize_t left,
10167 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010168 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010169{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010170 PyObject *u;
10171 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010172 int kind;
10173 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010174
10175 if (left < 0)
10176 left = 0;
10177 if (right < 0)
10178 right = 0;
10179
Victor Stinnerc4b49542011-12-11 22:44:26 +010010180 if (left == 0 && right == 0)
10181 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010182
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010183 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10184 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010185 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10186 return NULL;
10187 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010188 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010189 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010190 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010191 if (!u)
10192 return NULL;
10193
10194 kind = PyUnicode_KIND(u);
10195 data = PyUnicode_DATA(u);
10196 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010197 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010198 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010199 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010200 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010201 assert(_PyUnicode_CheckConsistency(u, 1));
10202 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010203}
10204
Alexander Belopolsky40018472011-02-26 01:02:56 +000010205PyObject *
10206PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010207{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010208 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010209
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010210 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010212
Benjamin Petersonead6b532011-12-20 17:23:42 -060010213 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010214 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010215 if (PyUnicode_IS_ASCII(string))
10216 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010217 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010218 PyUnicode_GET_LENGTH(string), keepends);
10219 else
10220 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010221 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010222 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010223 break;
10224 case PyUnicode_2BYTE_KIND:
10225 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010226 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010227 PyUnicode_GET_LENGTH(string), keepends);
10228 break;
10229 case PyUnicode_4BYTE_KIND:
10230 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010231 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010232 PyUnicode_GET_LENGTH(string), keepends);
10233 break;
10234 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010235 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010236 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010237 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010238}
10239
Alexander Belopolsky40018472011-02-26 01:02:56 +000010240static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010241split(PyObject *self,
10242 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010243 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010244{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010245 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010246 void *buf1, *buf2;
10247 Py_ssize_t len1, len2;
10248 PyObject* out;
10249
Guido van Rossumd57fd912000-03-10 22:53:23 +000010250 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010251 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010252
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010253 if (PyUnicode_READY(self) == -1)
10254 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010255
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010256 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010257 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010258 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010259 if (PyUnicode_IS_ASCII(self))
10260 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010261 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010262 PyUnicode_GET_LENGTH(self), maxcount
10263 );
10264 else
10265 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010266 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010267 PyUnicode_GET_LENGTH(self), maxcount
10268 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010269 case PyUnicode_2BYTE_KIND:
10270 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010271 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010272 PyUnicode_GET_LENGTH(self), maxcount
10273 );
10274 case PyUnicode_4BYTE_KIND:
10275 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010276 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010277 PyUnicode_GET_LENGTH(self), maxcount
10278 );
10279 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010280 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010281 }
10282
10283 if (PyUnicode_READY(substring) == -1)
10284 return NULL;
10285
10286 kind1 = PyUnicode_KIND(self);
10287 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010288 len1 = PyUnicode_GET_LENGTH(self);
10289 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010290 if (kind1 < kind2 || len1 < len2) {
10291 out = PyList_New(1);
10292 if (out == NULL)
10293 return NULL;
10294 Py_INCREF(self);
10295 PyList_SET_ITEM(out, 0, self);
10296 return out;
10297 }
10298 buf1 = PyUnicode_DATA(self);
10299 buf2 = PyUnicode_DATA(substring);
10300 if (kind2 != kind1) {
10301 buf2 = _PyUnicode_AsKind(substring, kind1);
10302 if (!buf2)
10303 return NULL;
10304 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010306 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010307 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010308 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10309 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010310 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010311 else
10312 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010313 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010314 break;
10315 case PyUnicode_2BYTE_KIND:
10316 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010317 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010318 break;
10319 case PyUnicode_4BYTE_KIND:
10320 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010321 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010322 break;
10323 default:
10324 out = NULL;
10325 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010326 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010327 PyMem_Free(buf2);
10328 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010329}
10330
Alexander Belopolsky40018472011-02-26 01:02:56 +000010331static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010332rsplit(PyObject *self,
10333 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010334 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010335{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010336 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010337 void *buf1, *buf2;
10338 Py_ssize_t len1, len2;
10339 PyObject* out;
10340
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010341 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010342 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010343
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010344 if (PyUnicode_READY(self) == -1)
10345 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010346
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010348 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010349 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010350 if (PyUnicode_IS_ASCII(self))
10351 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010352 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010353 PyUnicode_GET_LENGTH(self), maxcount
10354 );
10355 else
10356 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010357 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010358 PyUnicode_GET_LENGTH(self), maxcount
10359 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010360 case PyUnicode_2BYTE_KIND:
10361 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010362 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010363 PyUnicode_GET_LENGTH(self), maxcount
10364 );
10365 case PyUnicode_4BYTE_KIND:
10366 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010367 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 PyUnicode_GET_LENGTH(self), maxcount
10369 );
10370 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010371 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010372 }
10373
10374 if (PyUnicode_READY(substring) == -1)
10375 return NULL;
10376
10377 kind1 = PyUnicode_KIND(self);
10378 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010379 len1 = PyUnicode_GET_LENGTH(self);
10380 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010381 if (kind1 < kind2 || len1 < len2) {
10382 out = PyList_New(1);
10383 if (out == NULL)
10384 return NULL;
10385 Py_INCREF(self);
10386 PyList_SET_ITEM(out, 0, self);
10387 return out;
10388 }
10389 buf1 = PyUnicode_DATA(self);
10390 buf2 = PyUnicode_DATA(substring);
10391 if (kind2 != kind1) {
10392 buf2 = _PyUnicode_AsKind(substring, kind1);
10393 if (!buf2)
10394 return NULL;
10395 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010397 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010398 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010399 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10400 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010401 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010402 else
10403 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010404 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010405 break;
10406 case PyUnicode_2BYTE_KIND:
10407 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010408 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010409 break;
10410 case PyUnicode_4BYTE_KIND:
10411 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010412 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010413 break;
10414 default:
10415 out = NULL;
10416 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010417 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010418 PyMem_Free(buf2);
10419 return out;
10420}
10421
10422static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10424 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010425{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010426 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010427 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010428 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10429 return asciilib_find(buf1, len1, buf2, len2, offset);
10430 else
10431 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010432 case PyUnicode_2BYTE_KIND:
10433 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10434 case PyUnicode_4BYTE_KIND:
10435 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10436 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010437 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010438}
10439
10440static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010441anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10442 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010443{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010444 switch (kind) {
10445 case PyUnicode_1BYTE_KIND:
10446 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10447 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10448 else
10449 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10450 case PyUnicode_2BYTE_KIND:
10451 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10452 case PyUnicode_4BYTE_KIND:
10453 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10454 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010455 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010456}
10457
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010458static void
10459replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10460 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10461{
10462 int kind = PyUnicode_KIND(u);
10463 void *data = PyUnicode_DATA(u);
10464 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10465 if (kind == PyUnicode_1BYTE_KIND) {
10466 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10467 (Py_UCS1 *)data + len,
10468 u1, u2, maxcount);
10469 }
10470 else if (kind == PyUnicode_2BYTE_KIND) {
10471 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10472 (Py_UCS2 *)data + len,
10473 u1, u2, maxcount);
10474 }
10475 else {
10476 assert(kind == PyUnicode_4BYTE_KIND);
10477 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10478 (Py_UCS4 *)data + len,
10479 u1, u2, maxcount);
10480 }
10481}
10482
Alexander Belopolsky40018472011-02-26 01:02:56 +000010483static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010484replace(PyObject *self, PyObject *str1,
10485 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010486{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 PyObject *u;
10488 char *sbuf = PyUnicode_DATA(self);
10489 char *buf1 = PyUnicode_DATA(str1);
10490 char *buf2 = PyUnicode_DATA(str2);
10491 int srelease = 0, release1 = 0, release2 = 0;
10492 int skind = PyUnicode_KIND(self);
10493 int kind1 = PyUnicode_KIND(str1);
10494 int kind2 = PyUnicode_KIND(str2);
10495 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10496 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10497 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010498 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010499 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010500
10501 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010502 maxcount = PY_SSIZE_T_MAX;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010503 else if (maxcount == 0 || slen == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010504 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010505
Victor Stinner59de0ee2011-10-07 10:01:28 +020010506 if (str1 == str2)
10507 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010508
Victor Stinner49a0a212011-10-12 23:46:10 +020010509 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010510 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10511 if (maxchar < maxchar_str1)
10512 /* substring too wide to be present */
10513 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010514 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10515 /* Replacing str1 with str2 may cause a maxchar reduction in the
10516 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010517 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010518 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010519
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010521 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010522 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010523 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010525 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010526 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010527 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010528
Victor Stinner69ed0f42013-04-09 21:48:24 +020010529 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010530 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010531 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010532 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010533 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010534 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010535 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010536 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010537
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010538 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10539 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010540 }
10541 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 int rkind = skind;
10543 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010544 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010545
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010546 if (kind1 < rkind) {
10547 /* widen substring */
10548 buf1 = _PyUnicode_AsKind(str1, rkind);
10549 if (!buf1) goto error;
10550 release1 = 1;
10551 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010552 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010553 if (i < 0)
10554 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010555 if (rkind > kind2) {
10556 /* widen replacement */
10557 buf2 = _PyUnicode_AsKind(str2, rkind);
10558 if (!buf2) goto error;
10559 release2 = 1;
10560 }
10561 else if (rkind < kind2) {
10562 /* widen self and buf1 */
10563 rkind = kind2;
10564 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010565 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010566 sbuf = _PyUnicode_AsKind(self, rkind);
10567 if (!sbuf) goto error;
10568 srelease = 1;
10569 buf1 = _PyUnicode_AsKind(str1, rkind);
10570 if (!buf1) goto error;
10571 release1 = 1;
10572 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010573 u = PyUnicode_New(slen, maxchar);
10574 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010575 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010576 assert(PyUnicode_KIND(u) == rkind);
10577 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010578
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010579 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010580 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010581 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010582 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010583 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010584 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010585
10586 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010587 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010588 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010589 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010590 if (i == -1)
10591 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010592 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010593 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010594 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010595 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010596 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010597 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010598 }
10599 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010600 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010601 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 int rkind = skind;
10603 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010604
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010605 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010606 /* widen substring */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010607 buf1 = _PyUnicode_AsKind(str1, rkind);
10608 if (!buf1) goto error;
10609 release1 = 1;
10610 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010611 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010612 if (n == 0)
10613 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010614 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010615 /* widen replacement */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010616 buf2 = _PyUnicode_AsKind(str2, rkind);
10617 if (!buf2) goto error;
10618 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010620 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010621 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010622 rkind = kind2;
10623 sbuf = _PyUnicode_AsKind(self, rkind);
10624 if (!sbuf) goto error;
10625 srelease = 1;
10626 if (release1) PyMem_Free(buf1);
Antoine Pitrou6d5ad222012-11-17 23:28:17 +010010627 release1 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010628 buf1 = _PyUnicode_AsKind(str1, rkind);
10629 if (!buf1) goto error;
10630 release1 = 1;
10631 }
10632 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10633 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010634 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010635 PyErr_SetString(PyExc_OverflowError,
10636 "replace string is too long");
10637 goto error;
10638 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010639 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010640 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010641 _Py_INCREF_UNICODE_EMPTY();
10642 if (!unicode_empty)
10643 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010644 u = unicode_empty;
10645 goto done;
10646 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010647 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010648 PyErr_SetString(PyExc_OverflowError,
10649 "replace string is too long");
10650 goto error;
10651 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010652 u = PyUnicode_New(new_size, maxchar);
10653 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010655 assert(PyUnicode_KIND(u) == rkind);
10656 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010657 ires = i = 0;
10658 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010659 while (n-- > 0) {
10660 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010661 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010662 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010663 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010664 if (j == -1)
10665 break;
10666 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010667 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010668 memcpy(res + rkind * ires,
10669 sbuf + rkind * i,
10670 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010671 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010672 }
10673 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010674 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010675 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010677 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010678 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010679 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010680 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010681 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010682 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010683 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010684 memcpy(res + rkind * ires,
10685 sbuf + rkind * i,
10686 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010687 }
10688 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010689 /* interleave */
10690 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010691 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010692 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010693 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010694 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010695 if (--n <= 0)
10696 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010697 memcpy(res + rkind * ires,
10698 sbuf + rkind * i,
10699 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010700 ires++;
10701 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010702 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010703 memcpy(res + rkind * ires,
10704 sbuf + rkind * i,
10705 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010706 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010707 }
10708
10709 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010710 unicode_adjust_maxchar(&u);
10711 if (u == NULL)
10712 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010713 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010714
10715 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 if (srelease)
10717 PyMem_FREE(sbuf);
10718 if (release1)
10719 PyMem_FREE(buf1);
10720 if (release2)
10721 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010722 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010724
Benjamin Peterson29060642009-01-31 22:14:21 +000010725 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010726 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010727 if (srelease)
10728 PyMem_FREE(sbuf);
10729 if (release1)
10730 PyMem_FREE(buf1);
10731 if (release2)
10732 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010733 return unicode_result_unchanged(self);
10734
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 error:
10736 if (srelease && sbuf)
10737 PyMem_FREE(sbuf);
10738 if (release1 && buf1)
10739 PyMem_FREE(buf1);
10740 if (release2 && buf2)
10741 PyMem_FREE(buf2);
10742 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010743}
10744
10745/* --- Unicode Object Methods --------------------------------------------- */
10746
INADA Naoki3ae20562017-01-16 20:41:20 +090010747/*[clinic input]
10748str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010749
INADA Naoki3ae20562017-01-16 20:41:20 +090010750Return a version of the string where each word is titlecased.
10751
10752More specifically, words start with uppercased characters and all remaining
10753cased characters have lower case.
10754[clinic start generated code]*/
10755
10756static PyObject *
10757unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010758/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010759{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010760 if (PyUnicode_READY(self) == -1)
10761 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010762 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010763}
10764
INADA Naoki3ae20562017-01-16 20:41:20 +090010765/*[clinic input]
10766str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010767
INADA Naoki3ae20562017-01-16 20:41:20 +090010768Return a capitalized version of the string.
10769
10770More specifically, make the first character have upper case and the rest lower
10771case.
10772[clinic start generated code]*/
10773
10774static PyObject *
10775unicode_capitalize_impl(PyObject *self)
10776/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010777{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010778 if (PyUnicode_READY(self) == -1)
10779 return NULL;
10780 if (PyUnicode_GET_LENGTH(self) == 0)
10781 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010782 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010783}
10784
INADA Naoki3ae20562017-01-16 20:41:20 +090010785/*[clinic input]
10786str.casefold as unicode_casefold
10787
10788Return a version of the string suitable for caseless comparisons.
10789[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010790
10791static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010792unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010793/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010794{
10795 if (PyUnicode_READY(self) == -1)
10796 return NULL;
10797 if (PyUnicode_IS_ASCII(self))
10798 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010799 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010800}
10801
10802
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010803/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010804
10805static int
10806convert_uc(PyObject *obj, void *addr)
10807{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010808 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010809
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010810 if (!PyUnicode_Check(obj)) {
10811 PyErr_Format(PyExc_TypeError,
10812 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010813 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010814 return 0;
10815 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010816 if (PyUnicode_READY(obj) < 0)
10817 return 0;
10818 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010819 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010820 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010821 return 0;
10822 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010823 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010824 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010825}
10826
INADA Naoki3ae20562017-01-16 20:41:20 +090010827/*[clinic input]
10828str.center as unicode_center
10829
10830 width: Py_ssize_t
10831 fillchar: Py_UCS4 = ' '
10832 /
10833
10834Return a centered string of length width.
10835
10836Padding is done using the specified fill character (default is a space).
10837[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010838
10839static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010840unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10841/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010842{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010843 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010844
Benjamin Petersonbac79492012-01-14 13:34:47 -050010845 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010846 return NULL;
10847
Victor Stinnerc4b49542011-12-11 22:44:26 +010010848 if (PyUnicode_GET_LENGTH(self) >= width)
10849 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010850
Victor Stinnerc4b49542011-12-11 22:44:26 +010010851 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010852 left = marg / 2 + (marg & width & 1);
10853
Victor Stinner9310abb2011-10-05 00:59:23 +020010854 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010855}
10856
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010857/* This function assumes that str1 and str2 are readied by the caller. */
10858
Marc-André Lemburge5034372000-08-08 08:04:29 +000010859static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010860unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010861{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010862#define COMPARE(TYPE1, TYPE2) \
10863 do { \
10864 TYPE1* p1 = (TYPE1 *)data1; \
10865 TYPE2* p2 = (TYPE2 *)data2; \
10866 TYPE1* end = p1 + len; \
10867 Py_UCS4 c1, c2; \
10868 for (; p1 != end; p1++, p2++) { \
10869 c1 = *p1; \
10870 c2 = *p2; \
10871 if (c1 != c2) \
10872 return (c1 < c2) ? -1 : 1; \
10873 } \
10874 } \
10875 while (0)
10876
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010877 int kind1, kind2;
10878 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010879 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000010880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010881 kind1 = PyUnicode_KIND(str1);
10882 kind2 = PyUnicode_KIND(str2);
10883 data1 = PyUnicode_DATA(str1);
10884 data2 = PyUnicode_DATA(str2);
10885 len1 = PyUnicode_GET_LENGTH(str1);
10886 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020010887 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000010888
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010889 switch(kind1) {
10890 case PyUnicode_1BYTE_KIND:
10891 {
10892 switch(kind2) {
10893 case PyUnicode_1BYTE_KIND:
10894 {
10895 int cmp = memcmp(data1, data2, len);
10896 /* normalize result of memcmp() into the range [-1; 1] */
10897 if (cmp < 0)
10898 return -1;
10899 if (cmp > 0)
10900 return 1;
10901 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020010902 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010903 case PyUnicode_2BYTE_KIND:
10904 COMPARE(Py_UCS1, Py_UCS2);
10905 break;
10906 case PyUnicode_4BYTE_KIND:
10907 COMPARE(Py_UCS1, Py_UCS4);
10908 break;
10909 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010910 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010911 }
10912 break;
10913 }
10914 case PyUnicode_2BYTE_KIND:
10915 {
10916 switch(kind2) {
10917 case PyUnicode_1BYTE_KIND:
10918 COMPARE(Py_UCS2, Py_UCS1);
10919 break;
10920 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010921 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010922 COMPARE(Py_UCS2, Py_UCS2);
10923 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010924 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010925 case PyUnicode_4BYTE_KIND:
10926 COMPARE(Py_UCS2, Py_UCS4);
10927 break;
10928 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010929 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010930 }
10931 break;
10932 }
10933 case PyUnicode_4BYTE_KIND:
10934 {
10935 switch(kind2) {
10936 case PyUnicode_1BYTE_KIND:
10937 COMPARE(Py_UCS4, Py_UCS1);
10938 break;
10939 case PyUnicode_2BYTE_KIND:
10940 COMPARE(Py_UCS4, Py_UCS2);
10941 break;
10942 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020010943 {
10944#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10945 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10946 /* normalize result of wmemcmp() into the range [-1; 1] */
10947 if (cmp < 0)
10948 return -1;
10949 if (cmp > 0)
10950 return 1;
10951#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010952 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020010953#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010954 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020010955 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010956 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010957 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010958 }
10959 break;
10960 }
10961 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010962 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000010963 }
10964
Victor Stinner770e19e2012-10-04 22:59:45 +020010965 if (len1 == len2)
10966 return 0;
10967 if (len1 < len2)
10968 return -1;
10969 else
10970 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010971
10972#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000010973}
10974
Benjamin Peterson621b4302016-09-09 13:54:34 -070010975static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020010976unicode_compare_eq(PyObject *str1, PyObject *str2)
10977{
10978 int kind;
10979 void *data1, *data2;
10980 Py_ssize_t len;
10981 int cmp;
10982
Victor Stinnere5567ad2012-10-23 02:48:49 +020010983 len = PyUnicode_GET_LENGTH(str1);
10984 if (PyUnicode_GET_LENGTH(str2) != len)
10985 return 0;
10986 kind = PyUnicode_KIND(str1);
10987 if (PyUnicode_KIND(str2) != kind)
10988 return 0;
10989 data1 = PyUnicode_DATA(str1);
10990 data2 = PyUnicode_DATA(str2);
10991
10992 cmp = memcmp(data1, data2, len * kind);
10993 return (cmp == 0);
10994}
10995
10996
Alexander Belopolsky40018472011-02-26 01:02:56 +000010997int
10998PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010999{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011000 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11001 if (PyUnicode_READY(left) == -1 ||
11002 PyUnicode_READY(right) == -1)
11003 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011004
11005 /* a string is equal to itself */
11006 if (left == right)
11007 return 0;
11008
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011009 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011010 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011011 PyErr_Format(PyExc_TypeError,
11012 "Can't compare %.100s and %.100s",
11013 left->ob_type->tp_name,
11014 right->ob_type->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011015 return -1;
11016}
11017
Martin v. Löwis5b222132007-06-10 09:51:05 +000011018int
11019PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011021 Py_ssize_t i;
11022 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011023 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011024 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011025
Victor Stinner910337b2011-10-03 03:20:16 +020011026 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011027 if (!PyUnicode_IS_READY(uni)) {
11028 const wchar_t *ws = _PyUnicode_WSTR(uni);
11029 /* Compare Unicode string and source character set string */
11030 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11031 if (chr != ustr[i])
11032 return (chr < ustr[i]) ? -1 : 1;
11033 }
11034 /* This check keeps Python strings that end in '\0' from comparing equal
11035 to C strings identical up to that point. */
11036 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11037 return 1; /* uni is longer */
11038 if (ustr[i])
11039 return -1; /* str is longer */
11040 return 0;
11041 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011042 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011043 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011044 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011045 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011046 size_t len, len2 = strlen(str);
11047 int cmp;
11048
11049 len = Py_MIN(len1, len2);
11050 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011051 if (cmp != 0) {
11052 if (cmp < 0)
11053 return -1;
11054 else
11055 return 1;
11056 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011057 if (len1 > len2)
11058 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011059 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011060 return -1; /* str is longer */
11061 return 0;
11062 }
11063 else {
11064 void *data = PyUnicode_DATA(uni);
11065 /* Compare Unicode string and source character set string */
11066 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011067 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011068 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11069 /* This check keeps Python strings that end in '\0' from comparing equal
11070 to C strings identical up to that point. */
11071 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11072 return 1; /* uni is longer */
11073 if (str[i])
11074 return -1; /* str is longer */
11075 return 0;
11076 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011077}
11078
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011079static int
11080non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11081{
11082 size_t i, len;
11083 const wchar_t *p;
11084 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11085 if (strlen(str) != len)
11086 return 0;
11087 p = _PyUnicode_WSTR(unicode);
11088 assert(p);
11089 for (i = 0; i < len; i++) {
11090 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011091 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011092 return 0;
11093 }
11094 return 1;
11095}
11096
11097int
11098_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11099{
11100 size_t len;
11101 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011102 assert(str);
11103#ifndef NDEBUG
11104 for (const char *p = str; *p; p++) {
11105 assert((unsigned char)*p < 128);
11106 }
11107#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011108 if (PyUnicode_READY(unicode) == -1) {
11109 /* Memory error or bad data */
11110 PyErr_Clear();
11111 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11112 }
11113 if (!PyUnicode_IS_ASCII(unicode))
11114 return 0;
11115 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11116 return strlen(str) == len &&
11117 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11118}
11119
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011120int
11121_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11122{
11123 PyObject *right_uni;
11124 Py_hash_t hash;
11125
11126 assert(_PyUnicode_CHECK(left));
11127 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011128#ifndef NDEBUG
11129 for (const char *p = right->string; *p; p++) {
11130 assert((unsigned char)*p < 128);
11131 }
11132#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011133
11134 if (PyUnicode_READY(left) == -1) {
11135 /* memory error or bad data */
11136 PyErr_Clear();
11137 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11138 }
11139
11140 if (!PyUnicode_IS_ASCII(left))
11141 return 0;
11142
11143 right_uni = _PyUnicode_FromId(right); /* borrowed */
11144 if (right_uni == NULL) {
11145 /* memory error or bad data */
11146 PyErr_Clear();
11147 return _PyUnicode_EqualToASCIIString(left, right->string);
11148 }
11149
11150 if (left == right_uni)
11151 return 1;
11152
11153 if (PyUnicode_CHECK_INTERNED(left))
11154 return 0;
11155
INADA Naoki7cc95f52018-01-28 02:07:09 +090011156 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011157 hash = _PyUnicode_HASH(left);
11158 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11159 return 0;
11160
11161 return unicode_compare_eq(left, right_uni);
11162}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011163
Alexander Belopolsky40018472011-02-26 01:02:56 +000011164PyObject *
11165PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011166{
11167 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011168
Victor Stinnere5567ad2012-10-23 02:48:49 +020011169 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11170 Py_RETURN_NOTIMPLEMENTED;
11171
11172 if (PyUnicode_READY(left) == -1 ||
11173 PyUnicode_READY(right) == -1)
11174 return NULL;
11175
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011176 if (left == right) {
11177 switch (op) {
11178 case Py_EQ:
11179 case Py_LE:
11180 case Py_GE:
11181 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011182 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011183 case Py_NE:
11184 case Py_LT:
11185 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011186 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011187 default:
11188 PyErr_BadArgument();
11189 return NULL;
11190 }
11191 }
11192 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011193 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011194 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011195 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011196 }
11197 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011198 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011199 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011200 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011201}
11202
Alexander Belopolsky40018472011-02-26 01:02:56 +000011203int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011204_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11205{
11206 return unicode_eq(aa, bb);
11207}
11208
11209int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011210PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011211{
Victor Stinner77282cb2013-04-14 19:22:47 +020011212 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011213 void *buf1, *buf2;
11214 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011215 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011216
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011217 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011218 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011219 "'in <string>' requires string as left operand, not %.100s",
11220 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011221 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011222 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011223 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011224 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011225 if (ensure_unicode(str) < 0)
11226 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011227
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011228 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011229 kind2 = PyUnicode_KIND(substr);
11230 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011231 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011232 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011233 len2 = PyUnicode_GET_LENGTH(substr);
11234 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011235 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011236 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011237 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011238 if (len2 == 1) {
11239 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11240 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011241 return result;
11242 }
11243 if (kind2 != kind1) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011244 buf2 = _PyUnicode_AsKind(substr, kind1);
11245 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011246 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011248
Victor Stinner77282cb2013-04-14 19:22:47 +020011249 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011250 case PyUnicode_1BYTE_KIND:
11251 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11252 break;
11253 case PyUnicode_2BYTE_KIND:
11254 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11255 break;
11256 case PyUnicode_4BYTE_KIND:
11257 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11258 break;
11259 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011260 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011261 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011262
Victor Stinner77282cb2013-04-14 19:22:47 +020011263 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011264 PyMem_Free(buf2);
11265
Guido van Rossum403d68b2000-03-13 15:55:09 +000011266 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011267}
11268
Guido van Rossumd57fd912000-03-10 22:53:23 +000011269/* Concat to string or Unicode object giving a new Unicode object. */
11270
Alexander Belopolsky40018472011-02-26 01:02:56 +000011271PyObject *
11272PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011273{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011274 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011275 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011276 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011277
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011278 if (ensure_unicode(left) < 0)
11279 return NULL;
11280
11281 if (!PyUnicode_Check(right)) {
11282 PyErr_Format(PyExc_TypeError,
11283 "can only concatenate str (not \"%.200s\") to str",
11284 right->ob_type->tp_name);
11285 return NULL;
11286 }
11287 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011288 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011289
11290 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011291 if (left == unicode_empty)
11292 return PyUnicode_FromObject(right);
11293 if (right == unicode_empty)
11294 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011295
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011296 left_len = PyUnicode_GET_LENGTH(left);
11297 right_len = PyUnicode_GET_LENGTH(right);
11298 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011299 PyErr_SetString(PyExc_OverflowError,
11300 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011301 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011302 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011303 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011304
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011305 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11306 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011307 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011308
Guido van Rossumd57fd912000-03-10 22:53:23 +000011309 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011310 result = PyUnicode_New(new_len, maxchar);
11311 if (result == NULL)
11312 return NULL;
11313 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11314 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11315 assert(_PyUnicode_CheckConsistency(result, 1));
11316 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011317}
11318
Walter Dörwald1ab83302007-05-18 17:15:44 +000011319void
Victor Stinner23e56682011-10-03 03:54:37 +020011320PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011321{
Victor Stinner23e56682011-10-03 03:54:37 +020011322 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011323 Py_UCS4 maxchar, maxchar2;
11324 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011325
11326 if (p_left == NULL) {
11327 if (!PyErr_Occurred())
11328 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011329 return;
11330 }
Victor Stinner23e56682011-10-03 03:54:37 +020011331 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011332 if (right == NULL || left == NULL
11333 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011334 if (!PyErr_Occurred())
11335 PyErr_BadInternalCall();
11336 goto error;
11337 }
11338
Benjamin Petersonbac79492012-01-14 13:34:47 -050011339 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011340 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011341 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011342 goto error;
11343
Victor Stinner488fa492011-12-12 00:01:39 +010011344 /* Shortcuts */
11345 if (left == unicode_empty) {
11346 Py_DECREF(left);
11347 Py_INCREF(right);
11348 *p_left = right;
11349 return;
11350 }
11351 if (right == unicode_empty)
11352 return;
11353
11354 left_len = PyUnicode_GET_LENGTH(left);
11355 right_len = PyUnicode_GET_LENGTH(right);
11356 if (left_len > PY_SSIZE_T_MAX - right_len) {
11357 PyErr_SetString(PyExc_OverflowError,
11358 "strings are too large to concat");
11359 goto error;
11360 }
11361 new_len = left_len + right_len;
11362
11363 if (unicode_modifiable(left)
11364 && PyUnicode_CheckExact(right)
11365 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011366 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11367 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011368 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011369 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011370 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11371 {
11372 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011373 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011374 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011375
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011376 /* copy 'right' into the newly allocated area of 'left' */
11377 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011378 }
Victor Stinner488fa492011-12-12 00:01:39 +010011379 else {
11380 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11381 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011382 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011383
Victor Stinner488fa492011-12-12 00:01:39 +010011384 /* Concat the two Unicode strings */
11385 res = PyUnicode_New(new_len, maxchar);
11386 if (res == NULL)
11387 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011388 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11389 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011390 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011391 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011392 }
11393 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011394 return;
11395
11396error:
Victor Stinner488fa492011-12-12 00:01:39 +010011397 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011398}
11399
11400void
11401PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11402{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011403 PyUnicode_Append(pleft, right);
11404 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011405}
11406
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011407/*
11408Wraps stringlib_parse_args_finds() and additionally ensures that the
11409first argument is a unicode object.
11410*/
11411
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011412static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011413parse_args_finds_unicode(const char * function_name, PyObject *args,
11414 PyObject **substring,
11415 Py_ssize_t *start, Py_ssize_t *end)
11416{
11417 if(stringlib_parse_args_finds(function_name, args, substring,
11418 start, end)) {
11419 if (ensure_unicode(*substring) < 0)
11420 return 0;
11421 return 1;
11422 }
11423 return 0;
11424}
11425
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011426PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011427 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011428\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011429Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011430string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011431interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011432
11433static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011434unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011435{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011436 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011437 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011438 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011439 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011440 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011441 void *buf1, *buf2;
11442 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011443
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011444 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011445 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011446
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011447 kind1 = PyUnicode_KIND(self);
11448 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011449 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011450 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011451
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011452 len1 = PyUnicode_GET_LENGTH(self);
11453 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011454 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011455 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011456 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011457
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011458 buf1 = PyUnicode_DATA(self);
11459 buf2 = PyUnicode_DATA(substring);
11460 if (kind2 != kind1) {
11461 buf2 = _PyUnicode_AsKind(substring, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011462 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011463 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011464 }
11465 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011466 case PyUnicode_1BYTE_KIND:
11467 iresult = ucs1lib_count(
11468 ((Py_UCS1*)buf1) + start, end - start,
11469 buf2, len2, PY_SSIZE_T_MAX
11470 );
11471 break;
11472 case PyUnicode_2BYTE_KIND:
11473 iresult = ucs2lib_count(
11474 ((Py_UCS2*)buf1) + start, end - start,
11475 buf2, len2, PY_SSIZE_T_MAX
11476 );
11477 break;
11478 case PyUnicode_4BYTE_KIND:
11479 iresult = ucs4lib_count(
11480 ((Py_UCS4*)buf1) + start, end - start,
11481 buf2, len2, PY_SSIZE_T_MAX
11482 );
11483 break;
11484 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011485 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011486 }
11487
11488 result = PyLong_FromSsize_t(iresult);
11489
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011490 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011491 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011492
Guido van Rossumd57fd912000-03-10 22:53:23 +000011493 return result;
11494}
11495
INADA Naoki3ae20562017-01-16 20:41:20 +090011496/*[clinic input]
11497str.encode as unicode_encode
11498
11499 encoding: str(c_default="NULL") = 'utf-8'
11500 The encoding in which to encode the string.
11501 errors: str(c_default="NULL") = 'strict'
11502 The error handling scheme to use for encoding errors.
11503 The default is 'strict' meaning that encoding errors raise a
11504 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11505 'xmlcharrefreplace' as well as any other name registered with
11506 codecs.register_error that can handle UnicodeEncodeErrors.
11507
11508Encode the string using the codec registered for encoding.
11509[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011510
11511static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011512unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011513/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011514{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011515 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011516}
11517
INADA Naoki3ae20562017-01-16 20:41:20 +090011518/*[clinic input]
11519str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011520
INADA Naoki3ae20562017-01-16 20:41:20 +090011521 tabsize: int = 8
11522
11523Return a copy where all tab characters are expanded using spaces.
11524
11525If tabsize is not given, a tab size of 8 characters is assumed.
11526[clinic start generated code]*/
11527
11528static PyObject *
11529unicode_expandtabs_impl(PyObject *self, int tabsize)
11530/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011531{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011532 Py_ssize_t i, j, line_pos, src_len, incr;
11533 Py_UCS4 ch;
11534 PyObject *u;
11535 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011536 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011537 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011538
Antoine Pitrou22425222011-10-04 19:10:51 +020011539 if (PyUnicode_READY(self) == -1)
11540 return NULL;
11541
Thomas Wouters7e474022000-07-16 12:04:32 +000011542 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011543 src_len = PyUnicode_GET_LENGTH(self);
11544 i = j = line_pos = 0;
11545 kind = PyUnicode_KIND(self);
11546 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011547 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011548 for (; i < src_len; i++) {
11549 ch = PyUnicode_READ(kind, src_data, i);
11550 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011551 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011552 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011553 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011554 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011555 goto overflow;
11556 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011557 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011558 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011559 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011560 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011561 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011562 goto overflow;
11563 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011564 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011565 if (ch == '\n' || ch == '\r')
11566 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011568 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011569 if (!found)
11570 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011571
Guido van Rossumd57fd912000-03-10 22:53:23 +000011572 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011573 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011574 if (!u)
11575 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011576 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011577
Antoine Pitroue71d5742011-10-04 15:55:09 +020011578 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011579
Antoine Pitroue71d5742011-10-04 15:55:09 +020011580 for (; i < src_len; i++) {
11581 ch = PyUnicode_READ(kind, src_data, i);
11582 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011583 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011584 incr = tabsize - (line_pos % tabsize);
11585 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011586 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011587 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011588 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011589 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011590 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011591 line_pos++;
11592 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011593 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011594 if (ch == '\n' || ch == '\r')
11595 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011596 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011597 }
11598 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011599 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011600
Antoine Pitroue71d5742011-10-04 15:55:09 +020011601 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011602 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11603 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011604}
11605
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011606PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011607 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011608\n\
11609Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011610such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011611arguments start and end are interpreted as in slice notation.\n\
11612\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011613Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011614
11615static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011616unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011618 /* initialize variables to prevent gcc warning */
11619 PyObject *substring = NULL;
11620 Py_ssize_t start = 0;
11621 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011622 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011623
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011624 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011625 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011626
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011627 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011628 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011629
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011630 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011631
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011632 if (result == -2)
11633 return NULL;
11634
Christian Heimes217cfd12007-12-02 14:31:20 +000011635 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011636}
11637
11638static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011639unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011640{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011641 void *data;
11642 enum PyUnicode_Kind kind;
11643 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011644
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011645 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011646 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011647 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011648 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011649 if (PyUnicode_READY(self) == -1) {
11650 return NULL;
11651 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011652 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11653 PyErr_SetString(PyExc_IndexError, "string index out of range");
11654 return NULL;
11655 }
11656 kind = PyUnicode_KIND(self);
11657 data = PyUnicode_DATA(self);
11658 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011659 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011660}
11661
Guido van Rossumc2504932007-09-18 19:42:40 +000011662/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011663 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011664static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011665unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011666{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011667 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011668
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011669#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011670 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011671#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011672 if (_PyUnicode_HASH(self) != -1)
11673 return _PyUnicode_HASH(self);
11674 if (PyUnicode_READY(self) == -1)
11675 return -1;
animalizea1d14252019-01-02 20:16:06 +080011676
Christian Heimes985ecdc2013-11-20 11:46:18 +010011677 x = _Py_HashBytes(PyUnicode_DATA(self),
11678 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011679 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011680 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011681}
11682
/* Docstring text for S.index(), declared through PyDoc_STRVAR under the
   name index__doc__.  The literal is continued across lines with
   backslash-newline, so every character on the continuation lines is part
   of the string. */
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011683PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011684 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011685\n\
oldkaa0735f2018-02-02 16:52:55 +080011686Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011687such that sub is contained within S[start:end]. Optional\n\
11688arguments start and end are interpreted as in slice notation.\n\
11689\n\
11690Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691
11692static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011693unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011694{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011695 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011696 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011697 PyObject *substring = NULL;
11698 Py_ssize_t start = 0;
11699 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011700
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011701 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011702 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011704 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011705 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011706
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011707 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011708
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011709 if (result == -2)
11710 return NULL;
11711
Guido van Rossumd57fd912000-03-10 22:53:23 +000011712 if (result < 0) {
11713 PyErr_SetString(PyExc_ValueError, "substring not found");
11714 return NULL;
11715 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011716
Christian Heimes217cfd12007-12-02 14:31:20 +000011717 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011718}
11719
INADA Naoki3ae20562017-01-16 20:41:20 +090011720/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011721str.isascii as unicode_isascii
11722
11723Return True if all characters in the string are ASCII, False otherwise.
11724
11725ASCII characters have code points in the range U+0000-U+007F.
11726Empty string is ASCII too.
11727[clinic start generated code]*/
11728
11729static PyObject *
11730unicode_isascii_impl(PyObject *self)
11731/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11732{
11733 if (PyUnicode_READY(self) == -1) {
11734 return NULL;
11735 }
11736 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11737}
11738
11739/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011740str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741
INADA Naoki3ae20562017-01-16 20:41:20 +090011742Return True if the string is a lowercase string, False otherwise.
11743
11744A string is lowercase if all cased characters in the string are lowercase and
11745there is at least one cased character in the string.
11746[clinic start generated code]*/
11747
11748static PyObject *
11749unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011750/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011751{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 Py_ssize_t i, length;
11753 int kind;
11754 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011755 int cased;
11756
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011757 if (PyUnicode_READY(self) == -1)
11758 return NULL;
11759 length = PyUnicode_GET_LENGTH(self);
11760 kind = PyUnicode_KIND(self);
11761 data = PyUnicode_DATA(self);
11762
Guido van Rossumd57fd912000-03-10 22:53:23 +000011763 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011764 if (length == 1)
11765 return PyBool_FromLong(
11766 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011767
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011768 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011769 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011770 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011771
Guido van Rossumd57fd912000-03-10 22:53:23 +000011772 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011773 for (i = 0; i < length; i++) {
11774 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011775
Benjamin Peterson29060642009-01-31 22:14:21 +000011776 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011777 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011778 else if (!cased && Py_UNICODE_ISLOWER(ch))
11779 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011780 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011781 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011782}
11783
INADA Naoki3ae20562017-01-16 20:41:20 +090011784/*[clinic input]
11785str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011786
INADA Naoki3ae20562017-01-16 20:41:20 +090011787Return True if the string is an uppercase string, False otherwise.
11788
11789A string is uppercase if all cased characters in the string are uppercase and
11790there is at least one cased character in the string.
11791[clinic start generated code]*/
11792
11793static PyObject *
11794unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011795/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011796{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011797 Py_ssize_t i, length;
11798 int kind;
11799 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011800 int cased;
11801
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011802 if (PyUnicode_READY(self) == -1)
11803 return NULL;
11804 length = PyUnicode_GET_LENGTH(self);
11805 kind = PyUnicode_KIND(self);
11806 data = PyUnicode_DATA(self);
11807
Guido van Rossumd57fd912000-03-10 22:53:23 +000011808 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011809 if (length == 1)
11810 return PyBool_FromLong(
11811 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011812
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011813 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011814 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011815 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011816
Guido van Rossumd57fd912000-03-10 22:53:23 +000011817 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011818 for (i = 0; i < length; i++) {
11819 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011820
Benjamin Peterson29060642009-01-31 22:14:21 +000011821 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011822 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011823 else if (!cased && Py_UNICODE_ISUPPER(ch))
11824 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011825 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011826 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827}
11828
INADA Naoki3ae20562017-01-16 20:41:20 +090011829/*[clinic input]
11830str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011831
INADA Naoki3ae20562017-01-16 20:41:20 +090011832Return True if the string is a title-cased string, False otherwise.
11833
11834In a title-cased string, upper- and title-case characters may only
11835follow uncased characters and lowercase characters only cased ones.
11836[clinic start generated code]*/
11837
11838static PyObject *
11839unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011840/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011841{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011842 Py_ssize_t i, length;
11843 int kind;
11844 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011845 int cased, previous_is_cased;
11846
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011847 if (PyUnicode_READY(self) == -1)
11848 return NULL;
11849 length = PyUnicode_GET_LENGTH(self);
11850 kind = PyUnicode_KIND(self);
11851 data = PyUnicode_DATA(self);
11852
Guido van Rossumd57fd912000-03-10 22:53:23 +000011853 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011854 if (length == 1) {
11855 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11856 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11857 (Py_UNICODE_ISUPPER(ch) != 0));
11858 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011859
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011860 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011861 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011862 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011863
Guido van Rossumd57fd912000-03-10 22:53:23 +000011864 cased = 0;
11865 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011866 for (i = 0; i < length; i++) {
11867 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011868
Benjamin Peterson29060642009-01-31 22:14:21 +000011869 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11870 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011871 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011872 previous_is_cased = 1;
11873 cased = 1;
11874 }
11875 else if (Py_UNICODE_ISLOWER(ch)) {
11876 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011877 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011878 previous_is_cased = 1;
11879 cased = 1;
11880 }
11881 else
11882 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011883 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011884 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011885}
11886
INADA Naoki3ae20562017-01-16 20:41:20 +090011887/*[clinic input]
11888str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000011889
INADA Naoki3ae20562017-01-16 20:41:20 +090011890Return True if the string is a whitespace string, False otherwise.
11891
11892A string is whitespace if all characters in the string are whitespace and there
11893is at least one character in the string.
11894[clinic start generated code]*/
11895
11896static PyObject *
11897unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011898/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011899{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011900 Py_ssize_t i, length;
11901 int kind;
11902 void *data;
11903
11904 if (PyUnicode_READY(self) == -1)
11905 return NULL;
11906 length = PyUnicode_GET_LENGTH(self);
11907 kind = PyUnicode_KIND(self);
11908 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011909
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011911 if (length == 1)
11912 return PyBool_FromLong(
11913 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011914
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011915 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011916 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011917 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011918
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011919 for (i = 0; i < length; i++) {
11920 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030011921 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011922 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011923 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011924 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011925}
11926
INADA Naoki3ae20562017-01-16 20:41:20 +090011927/*[clinic input]
11928str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011929
INADA Naoki3ae20562017-01-16 20:41:20 +090011930Return True if the string is an alphabetic string, False otherwise.
11931
11932A string is alphabetic if all characters in the string are alphabetic and there
11933is at least one character in the string.
11934[clinic start generated code]*/
11935
11936static PyObject *
11937unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011938/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011939{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011940 Py_ssize_t i, length;
11941 int kind;
11942 void *data;
11943
11944 if (PyUnicode_READY(self) == -1)
11945 return NULL;
11946 length = PyUnicode_GET_LENGTH(self);
11947 kind = PyUnicode_KIND(self);
11948 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011949
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011950 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011951 if (length == 1)
11952 return PyBool_FromLong(
11953 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011954
11955 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011956 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011957 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011958
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011959 for (i = 0; i < length; i++) {
11960 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011961 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011962 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011963 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011964}
11965
INADA Naoki3ae20562017-01-16 20:41:20 +090011966/*[clinic input]
11967str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011968
INADA Naoki3ae20562017-01-16 20:41:20 +090011969Return True if the string is an alpha-numeric string, False otherwise.
11970
11971A string is alpha-numeric if all characters in the string are alpha-numeric and
11972there is at least one character in the string.
11973[clinic start generated code]*/
11974
11975static PyObject *
11976unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011977/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011978{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011979 int kind;
11980 void *data;
11981 Py_ssize_t len, i;
11982
11983 if (PyUnicode_READY(self) == -1)
11984 return NULL;
11985
11986 kind = PyUnicode_KIND(self);
11987 data = PyUnicode_DATA(self);
11988 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011989
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011990 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011991 if (len == 1) {
11992 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11993 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11994 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011995
11996 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011997 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011998 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000011999
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012000 for (i = 0; i < len; i++) {
12001 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012002 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012003 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012004 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012005 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012006}
12007
INADA Naoki3ae20562017-01-16 20:41:20 +090012008/*[clinic input]
12009str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012010
INADA Naoki3ae20562017-01-16 20:41:20 +090012011Return True if the string is a decimal string, False otherwise.
12012
12013A string is a decimal string if all characters in the string are decimal and
12014there is at least one character in the string.
12015[clinic start generated code]*/
12016
12017static PyObject *
12018unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012019/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012020{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012021 Py_ssize_t i, length;
12022 int kind;
12023 void *data;
12024
12025 if (PyUnicode_READY(self) == -1)
12026 return NULL;
12027 length = PyUnicode_GET_LENGTH(self);
12028 kind = PyUnicode_KIND(self);
12029 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012030
Guido van Rossumd57fd912000-03-10 22:53:23 +000012031 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012032 if (length == 1)
12033 return PyBool_FromLong(
12034 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012035
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012036 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012037 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012038 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012039
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 for (i = 0; i < length; i++) {
12041 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012042 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012043 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012044 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012045}
12046
INADA Naoki3ae20562017-01-16 20:41:20 +090012047/*[clinic input]
12048str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049
INADA Naoki3ae20562017-01-16 20:41:20 +090012050Return True if the string is a digit string, False otherwise.
12051
12052A string is a digit string if all characters in the string are digits and there
12053is at least one character in the string.
12054[clinic start generated code]*/
12055
12056static PyObject *
12057unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012058/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012059{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012060 Py_ssize_t i, length;
12061 int kind;
12062 void *data;
12063
12064 if (PyUnicode_READY(self) == -1)
12065 return NULL;
12066 length = PyUnicode_GET_LENGTH(self);
12067 kind = PyUnicode_KIND(self);
12068 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012069
Guido van Rossumd57fd912000-03-10 22:53:23 +000012070 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012071 if (length == 1) {
12072 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12073 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12074 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012075
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012076 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012077 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012078 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012079
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 for (i = 0; i < length; i++) {
12081 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012082 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012083 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012084 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012085}
12086
INADA Naoki3ae20562017-01-16 20:41:20 +090012087/*[clinic input]
12088str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012089
INADA Naoki3ae20562017-01-16 20:41:20 +090012090Return True if the string is a numeric string, False otherwise.
12091
12092A string is numeric if all characters in the string are numeric and there is at
12093least one character in the string.
12094[clinic start generated code]*/
12095
12096static PyObject *
12097unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012098/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012099{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012100 Py_ssize_t i, length;
12101 int kind;
12102 void *data;
12103
12104 if (PyUnicode_READY(self) == -1)
12105 return NULL;
12106 length = PyUnicode_GET_LENGTH(self);
12107 kind = PyUnicode_KIND(self);
12108 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012109
Guido van Rossumd57fd912000-03-10 22:53:23 +000012110 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012111 if (length == 1)
12112 return PyBool_FromLong(
12113 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012114
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012115 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012116 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012117 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012118
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012119 for (i = 0; i < length; i++) {
12120 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012121 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012122 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012123 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012124}
12125
Martin v. Löwis47383402007-08-15 07:32:56 +000012126int
12127PyUnicode_IsIdentifier(PyObject *self)
12128{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012129 int kind;
12130 void *data;
12131 Py_ssize_t i;
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012132 Py_UCS4 first;
Martin v. Löwis47383402007-08-15 07:32:56 +000012133
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012134 if (PyUnicode_READY(self) == -1) {
12135 Py_FatalError("identifier not ready");
Benjamin Peterson29060642009-01-31 22:14:21 +000012136 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012137 }
12138
12139 /* Special case for empty strings */
12140 if (PyUnicode_GET_LENGTH(self) == 0)
12141 return 0;
12142 kind = PyUnicode_KIND(self);
12143 data = PyUnicode_DATA(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012144
12145 /* PEP 3131 says that the first character must be in
12146 XID_Start and subsequent characters in XID_Continue,
12147 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012148 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012149 letters, digits, underscore). However, given the current
12150 definition of XID_Start and XID_Continue, it is sufficient
12151 to check just for these, except that _ must be allowed
12152 as starting an identifier. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012153 first = PyUnicode_READ(kind, data, 0);
Benjamin Petersonf413b802011-08-12 22:17:18 -050012154 if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
Martin v. Löwis47383402007-08-15 07:32:56 +000012155 return 0;
12156
Benjamin Peterson9c6e6a02011-09-28 08:09:05 -040012157 for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012158 if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
Benjamin Peterson29060642009-01-31 22:14:21 +000012159 return 0;
Martin v. Löwis47383402007-08-15 07:32:56 +000012160 return 1;
12161}
12162
INADA Naoki3ae20562017-01-16 20:41:20 +090012163/*[clinic input]
12164str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012165
INADA Naoki3ae20562017-01-16 20:41:20 +090012166Return True if the string is a valid Python identifier, False otherwise.
12167
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012168Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012169such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012170[clinic start generated code]*/
12171
12172static PyObject *
12173unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012174/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012175{
12176 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12177}
12178
INADA Naoki3ae20562017-01-16 20:41:20 +090012179/*[clinic input]
12180str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012181
INADA Naoki3ae20562017-01-16 20:41:20 +090012182Return True if the string is printable, False otherwise.
12183
12184A string is printable if all of its characters are considered printable in
12185repr() or if it is empty.
12186[clinic start generated code]*/
12187
12188static PyObject *
12189unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012190/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012191{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012192 Py_ssize_t i, length;
12193 int kind;
12194 void *data;
12195
12196 if (PyUnicode_READY(self) == -1)
12197 return NULL;
12198 length = PyUnicode_GET_LENGTH(self);
12199 kind = PyUnicode_KIND(self);
12200 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012201
12202 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012203 if (length == 1)
12204 return PyBool_FromLong(
12205 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012206
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012207 for (i = 0; i < length; i++) {
12208 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012209 Py_RETURN_FALSE;
12210 }
12211 }
12212 Py_RETURN_TRUE;
12213}
12214
INADA Naoki3ae20562017-01-16 20:41:20 +090012215/*[clinic input]
12216str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012217
INADA Naoki3ae20562017-01-16 20:41:20 +090012218 iterable: object
12219 /
12220
12221Concatenate any number of strings.
12222
Martin Panter91a88662017-01-24 00:30:06 +000012223The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012224The result is returned as a new string.
12225
12226Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12227[clinic start generated code]*/
12228
12229static PyObject *
12230unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012231/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012232{
INADA Naoki3ae20562017-01-16 20:41:20 +090012233 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234}
12235
Martin v. Löwis18e16552006-02-15 17:27:45 +000012236static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012237unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012239 if (PyUnicode_READY(self) == -1)
12240 return -1;
12241 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012242}
12243
INADA Naoki3ae20562017-01-16 20:41:20 +090012244/*[clinic input]
12245str.ljust as unicode_ljust
12246
12247 width: Py_ssize_t
12248 fillchar: Py_UCS4 = ' '
12249 /
12250
12251Return a left-justified string of length width.
12252
12253Padding is done using the specified fill character (default is a space).
12254[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012255
12256static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012257unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12258/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012259{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012260 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012261 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012262
Victor Stinnerc4b49542011-12-11 22:44:26 +010012263 if (PyUnicode_GET_LENGTH(self) >= width)
12264 return unicode_result_unchanged(self);
12265
12266 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012267}
12268
INADA Naoki3ae20562017-01-16 20:41:20 +090012269/*[clinic input]
12270str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012271
INADA Naoki3ae20562017-01-16 20:41:20 +090012272Return a copy of the string converted to lowercase.
12273[clinic start generated code]*/
12274
12275static PyObject *
12276unicode_lower_impl(PyObject *self)
12277/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012278{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012279 if (PyUnicode_READY(self) == -1)
12280 return NULL;
12281 if (PyUnicode_IS_ASCII(self))
12282 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012283 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012284}
12285
/* Strip modes.  The values double as indices into stripfuncnames[]. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* Method name for each strip mode, indexed by the constants above. */
static const char *stripfuncnames[] = {
    [LEFTSTRIP]  = "lstrip",
    [RIGHTSTRIP] = "rstrip",
    [BOTHSTRIP]  = "strip",
};

/* Look up the method name for strip mode i. */
#define STRIPNAME(i) (stripfuncnames[i])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012294
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012295/* externally visible for str.strip(unicode) */
12296PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012297_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012298{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012299 void *data;
12300 int kind;
12301 Py_ssize_t i, j, len;
12302 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012303 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012304
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012305 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12306 return NULL;
12307
12308 kind = PyUnicode_KIND(self);
12309 data = PyUnicode_DATA(self);
12310 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012311 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012312 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12313 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012314 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012315
Benjamin Peterson14339b62009-01-31 16:36:08 +000012316 i = 0;
12317 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012318 while (i < len) {
12319 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12320 if (!BLOOM(sepmask, ch))
12321 break;
12322 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12323 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012324 i++;
12325 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012326 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012327
Benjamin Peterson14339b62009-01-31 16:36:08 +000012328 j = len;
12329 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012330 j--;
12331 while (j >= i) {
12332 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12333 if (!BLOOM(sepmask, ch))
12334 break;
12335 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12336 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012337 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012338 }
12339
Benjamin Peterson29060642009-01-31 22:14:21 +000012340 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012341 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012342
Victor Stinner7931d9a2011-11-04 00:22:48 +010012343 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012344}
12345
12346PyObject*
12347PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12348{
12349 unsigned char *data;
12350 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012351 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012352
Victor Stinnerde636f32011-10-01 03:55:54 +020012353 if (PyUnicode_READY(self) == -1)
12354 return NULL;
12355
Victor Stinner684d5fd2012-05-03 02:32:34 +020012356 length = PyUnicode_GET_LENGTH(self);
12357 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012358
Victor Stinner684d5fd2012-05-03 02:32:34 +020012359 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012360 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012361
Victor Stinnerde636f32011-10-01 03:55:54 +020012362 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012363 PyErr_SetString(PyExc_IndexError, "string index out of range");
12364 return NULL;
12365 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012366 if (start >= length || end < start)
12367 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012368
Victor Stinner684d5fd2012-05-03 02:32:34 +020012369 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012370 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012371 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012372 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012373 }
12374 else {
12375 kind = PyUnicode_KIND(self);
12376 data = PyUnicode_1BYTE_DATA(self);
12377 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012378 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012379 length);
12380 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012381}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012382
12383static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012384do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012385{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012386 Py_ssize_t len, i, j;
12387
12388 if (PyUnicode_READY(self) == -1)
12389 return NULL;
12390
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012391 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012392
Victor Stinnercc7af722013-04-09 22:39:24 +020012393 if (PyUnicode_IS_ASCII(self)) {
12394 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12395
12396 i = 0;
12397 if (striptype != RIGHTSTRIP) {
12398 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012399 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012400 if (!_Py_ascii_whitespace[ch])
12401 break;
12402 i++;
12403 }
12404 }
12405
12406 j = len;
12407 if (striptype != LEFTSTRIP) {
12408 j--;
12409 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012410 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012411 if (!_Py_ascii_whitespace[ch])
12412 break;
12413 j--;
12414 }
12415 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012416 }
12417 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012418 else {
12419 int kind = PyUnicode_KIND(self);
12420 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012421
Victor Stinnercc7af722013-04-09 22:39:24 +020012422 i = 0;
12423 if (striptype != RIGHTSTRIP) {
12424 while (i < len) {
12425 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12426 if (!Py_UNICODE_ISSPACE(ch))
12427 break;
12428 i++;
12429 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012430 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012431
12432 j = len;
12433 if (striptype != LEFTSTRIP) {
12434 j--;
12435 while (j >= i) {
12436 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12437 if (!Py_UNICODE_ISSPACE(ch))
12438 break;
12439 j--;
12440 }
12441 j++;
12442 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012443 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012444
Victor Stinner7931d9a2011-11-04 00:22:48 +010012445 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012446}
12447
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012448
12449static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012450do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012451{
Benjamin Peterson14339b62009-01-31 16:36:08 +000012452 if (sep != NULL && sep != Py_None) {
12453 if (PyUnicode_Check(sep))
12454 return _PyUnicode_XStrip(self, striptype, sep);
12455 else {
12456 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012457 "%s arg must be None or str",
12458 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012459 return NULL;
12460 }
12461 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012462
Benjamin Peterson14339b62009-01-31 16:36:08 +000012463 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012464}
12465
12466
INADA Naoki3ae20562017-01-16 20:41:20 +090012467/*[clinic input]
12468str.strip as unicode_strip
12469
12470 chars: object = None
12471 /
12472
Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012474
12475If chars is given and not None, remove characters in chars instead.
12476[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012477
12478static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012479unicode_strip_impl(PyObject *self, PyObject *chars)
Victor Stinner0c4a8282017-01-17 02:21:47 +010012480/*[clinic end generated code: output=ca19018454345d57 input=eefe24a1059c352b]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012481{
INADA Naoki3ae20562017-01-16 20:41:20 +090012482 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012483}
12484
12485
INADA Naoki3ae20562017-01-16 20:41:20 +090012486/*[clinic input]
12487str.lstrip as unicode_lstrip
12488
12489 chars: object = NULL
12490 /
12491
12492Return a copy of the string with leading whitespace removed.
12493
12494If chars is given and not None, remove characters in chars instead.
12495[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012496
12497static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012498unicode_lstrip_impl(PyObject *self, PyObject *chars)
12499/*[clinic end generated code: output=3b43683251f79ca7 input=9e56f3c45f5ff4c3]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012500{
INADA Naoki3ae20562017-01-16 20:41:20 +090012501 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012502}
12503
12504
INADA Naoki3ae20562017-01-16 20:41:20 +090012505/*[clinic input]
12506str.rstrip as unicode_rstrip
12507
12508 chars: object = NULL
12509 /
12510
12511Return a copy of the string with trailing whitespace removed.
12512
12513If chars is given and not None, remove characters in chars instead.
12514[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012515
12516static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012517unicode_rstrip_impl(PyObject *self, PyObject *chars)
12518/*[clinic end generated code: output=4a59230017cc3b7a input=ac89d0219cb411ee]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012519{
INADA Naoki3ae20562017-01-16 20:41:20 +090012520 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012521}
12522
12523
Guido van Rossumd57fd912000-03-10 22:53:23 +000012524static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012525unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012526{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012527 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012528 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012529
Serhiy Storchaka05997252013-01-26 12:14:02 +020012530 if (len < 1)
12531 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012532
Victor Stinnerc4b49542011-12-11 22:44:26 +010012533 /* no repeat, return original string */
12534 if (len == 1)
12535 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012536
Benjamin Petersonbac79492012-01-14 13:34:47 -050012537 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012538 return NULL;
12539
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012540 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012541 PyErr_SetString(PyExc_OverflowError,
12542 "repeated string is too long");
12543 return NULL;
12544 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012545 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012546
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012547 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012548 if (!u)
12549 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012550 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012551
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012552 if (PyUnicode_GET_LENGTH(str) == 1) {
12553 const int kind = PyUnicode_KIND(str);
12554 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012555 if (kind == PyUnicode_1BYTE_KIND) {
12556 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012557 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012558 }
12559 else if (kind == PyUnicode_2BYTE_KIND) {
12560 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012561 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012562 ucs2[n] = fill_char;
12563 } else {
12564 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12565 assert(kind == PyUnicode_4BYTE_KIND);
12566 for (n = 0; n < len; ++n)
12567 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012568 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012569 }
12570 else {
12571 /* number of characters copied this far */
12572 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012573 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012574 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012575 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012576 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012577 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012578 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012579 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012580 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012581 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012582 }
12583
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012584 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012585 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012586}
12587
Alexander Belopolsky40018472011-02-26 01:02:56 +000012588PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012589PyUnicode_Replace(PyObject *str,
12590 PyObject *substr,
12591 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012592 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012593{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012594 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12595 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012596 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012597 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012598}
12599
INADA Naoki3ae20562017-01-16 20:41:20 +090012600/*[clinic input]
12601str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012602
INADA Naoki3ae20562017-01-16 20:41:20 +090012603 old: unicode
12604 new: unicode
12605 count: Py_ssize_t = -1
12606 Maximum number of occurrences to replace.
12607 -1 (the default value) means replace all occurrences.
12608 /
12609
12610Return a copy with all occurrences of substring old replaced by new.
12611
12612If the optional argument count is given, only the first count occurrences are
12613replaced.
12614[clinic start generated code]*/
12615
12616static PyObject *
12617unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12618 Py_ssize_t count)
12619/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012620{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012621 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012622 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012623 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012624}
12625
Alexander Belopolsky40018472011-02-26 01:02:56 +000012626static PyObject *
12627unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012628{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012629 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012630 Py_ssize_t isize;
12631 Py_ssize_t osize, squote, dquote, i, o;
12632 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012633 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012634 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012635
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012636 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012637 return NULL;
12638
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012639 isize = PyUnicode_GET_LENGTH(unicode);
12640 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012641
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012642 /* Compute length of output, quote characters, and
12643 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012644 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012645 max = 127;
12646 squote = dquote = 0;
12647 ikind = PyUnicode_KIND(unicode);
12648 for (i = 0; i < isize; i++) {
12649 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012650 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012651 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012652 case '\'': squote++; break;
12653 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012654 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012655 incr = 2;
12656 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012657 default:
12658 /* Fast-path ASCII */
12659 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012660 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012661 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012662 ;
12663 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012664 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012665 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012666 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012667 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012668 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012669 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012670 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012671 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012672 if (osize > PY_SSIZE_T_MAX - incr) {
12673 PyErr_SetString(PyExc_OverflowError,
12674 "string is too long to generate repr");
12675 return NULL;
12676 }
12677 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012678 }
12679
12680 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012681 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012682 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012683 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012684 if (dquote)
12685 /* Both squote and dquote present. Use squote,
12686 and escape them */
12687 osize += squote;
12688 else
12689 quote = '"';
12690 }
Victor Stinner55c08782013-04-14 18:45:39 +020012691 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012692
12693 repr = PyUnicode_New(osize, max);
12694 if (repr == NULL)
12695 return NULL;
12696 okind = PyUnicode_KIND(repr);
12697 odata = PyUnicode_DATA(repr);
12698
12699 PyUnicode_WRITE(okind, odata, 0, quote);
12700 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012701 if (unchanged) {
12702 _PyUnicode_FastCopyCharacters(repr, 1,
12703 unicode, 0,
12704 isize);
12705 }
12706 else {
12707 for (i = 0, o = 1; i < isize; i++) {
12708 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012709
Victor Stinner55c08782013-04-14 18:45:39 +020012710 /* Escape quotes and backslashes */
12711 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012712 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012713 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012714 continue;
12715 }
12716
12717 /* Map special whitespace to '\t', \n', '\r' */
12718 if (ch == '\t') {
12719 PyUnicode_WRITE(okind, odata, o++, '\\');
12720 PyUnicode_WRITE(okind, odata, o++, 't');
12721 }
12722 else if (ch == '\n') {
12723 PyUnicode_WRITE(okind, odata, o++, '\\');
12724 PyUnicode_WRITE(okind, odata, o++, 'n');
12725 }
12726 else if (ch == '\r') {
12727 PyUnicode_WRITE(okind, odata, o++, '\\');
12728 PyUnicode_WRITE(okind, odata, o++, 'r');
12729 }
12730
12731 /* Map non-printable US ASCII to '\xhh' */
12732 else if (ch < ' ' || ch == 0x7F) {
12733 PyUnicode_WRITE(okind, odata, o++, '\\');
12734 PyUnicode_WRITE(okind, odata, o++, 'x');
12735 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12736 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12737 }
12738
12739 /* Copy ASCII characters as-is */
12740 else if (ch < 0x7F) {
12741 PyUnicode_WRITE(okind, odata, o++, ch);
12742 }
12743
12744 /* Non-ASCII characters */
12745 else {
12746 /* Map Unicode whitespace and control characters
12747 (categories Z* and C* except ASCII space)
12748 */
12749 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12750 PyUnicode_WRITE(okind, odata, o++, '\\');
12751 /* Map 8-bit characters to '\xhh' */
12752 if (ch <= 0xff) {
12753 PyUnicode_WRITE(okind, odata, o++, 'x');
12754 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12755 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12756 }
12757 /* Map 16-bit characters to '\uxxxx' */
12758 else if (ch <= 0xffff) {
12759 PyUnicode_WRITE(okind, odata, o++, 'u');
12760 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12761 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12762 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12763 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12764 }
12765 /* Map 21-bit characters to '\U00xxxxxx' */
12766 else {
12767 PyUnicode_WRITE(okind, odata, o++, 'U');
12768 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12769 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12770 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12771 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12772 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12773 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12774 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12775 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12776 }
12777 }
12778 /* Copy characters as-is */
12779 else {
12780 PyUnicode_WRITE(okind, odata, o++, ch);
12781 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012782 }
12783 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012784 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012785 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012786 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012787 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012788}
12789
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012790PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012791 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012792\n\
12793Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012794such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012795arguments start and end are interpreted as in slice notation.\n\
12796\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012797Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012798
12799static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012800unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012801{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012802 /* initialize variables to prevent gcc warning */
12803 PyObject *substring = NULL;
12804 Py_ssize_t start = 0;
12805 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012806 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012807
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012808 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012809 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012810
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012811 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012812 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012813
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012814 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012815
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 if (result == -2)
12817 return NULL;
12818
Christian Heimes217cfd12007-12-02 14:31:20 +000012819 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012820}
12821
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012822PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012823 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012824\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012825Return the highest index in S where substring sub is found,\n\
12826such that sub is contained within S[start:end]. Optional\n\
12827arguments start and end are interpreted as in slice notation.\n\
12828\n\
12829Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012830
12831static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012832unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012833{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012834 /* initialize variables to prevent gcc warning */
12835 PyObject *substring = NULL;
12836 Py_ssize_t start = 0;
12837 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012838 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012839
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012840 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012841 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012842
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012843 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012844 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012845
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012846 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012847
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012848 if (result == -2)
12849 return NULL;
12850
Guido van Rossumd57fd912000-03-10 22:53:23 +000012851 if (result < 0) {
12852 PyErr_SetString(PyExc_ValueError, "substring not found");
12853 return NULL;
12854 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012855
Christian Heimes217cfd12007-12-02 14:31:20 +000012856 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012857}
12858
INADA Naoki3ae20562017-01-16 20:41:20 +090012859/*[clinic input]
12860str.rjust as unicode_rjust
12861
12862 width: Py_ssize_t
12863 fillchar: Py_UCS4 = ' '
12864 /
12865
12866Return a right-justified string of length width.
12867
12868Padding is done using the specified fill character (default is a space).
12869[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012870
12871static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012872unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12873/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012874{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012875 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012876 return NULL;
12877
Victor Stinnerc4b49542011-12-11 22:44:26 +010012878 if (PyUnicode_GET_LENGTH(self) >= width)
12879 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012880
Victor Stinnerc4b49542011-12-11 22:44:26 +010012881 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012882}
12883
Alexander Belopolsky40018472011-02-26 01:02:56 +000012884PyObject *
12885PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012886{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012887 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012888 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012889
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012890 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012891}
12892
INADA Naoki3ae20562017-01-16 20:41:20 +090012893/*[clinic input]
12894str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000012895
INADA Naoki3ae20562017-01-16 20:41:20 +090012896 sep: object = None
        The delimiter according to which to split the string.
12898 None (the default value) means split according to any whitespace,
12899 and discard empty strings from the result.
12900 maxsplit: Py_ssize_t = -1
12901 Maximum number of splits to do.
12902 -1 (the default value) means no limit.
12903
12904Return a list of the words in the string, using sep as the delimiter string.
12905[clinic start generated code]*/
12906
12907static PyObject *
12908unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
12909/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012910{
INADA Naoki3ae20562017-01-16 20:41:20 +090012911 if (sep == Py_None)
12912 return split(self, NULL, maxsplit);
12913 if (PyUnicode_Check(sep))
12914 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012915
Victor Stinner998b8062018-09-12 00:23:25 +020012916 PyErr_Format(PyExc_TypeError,
12917 "must be str or None, not %.100s",
12918 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012919 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012920}
12921
Thomas Wouters477c8d52006-05-27 19:21:47 +000012922PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012923PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012924{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012925 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012926 int kind1, kind2;
12927 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012928 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012929
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012930 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012931 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012932
Victor Stinner14f8f022011-10-05 20:58:25 +020012933 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012934 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012935 len1 = PyUnicode_GET_LENGTH(str_obj);
12936 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012937 if (kind1 < kind2 || len1 < len2) {
12938 _Py_INCREF_UNICODE_EMPTY();
12939 if (!unicode_empty)
12940 out = NULL;
12941 else {
12942 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12943 Py_DECREF(unicode_empty);
12944 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012945 return out;
12946 }
12947 buf1 = PyUnicode_DATA(str_obj);
12948 buf2 = PyUnicode_DATA(sep_obj);
12949 if (kind2 != kind1) {
12950 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12951 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012952 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012953 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012955 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012956 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020012957 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12958 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12959 else
12960 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012961 break;
12962 case PyUnicode_2BYTE_KIND:
12963 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12964 break;
12965 case PyUnicode_4BYTE_KIND:
12966 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12967 break;
12968 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070012969 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000012971
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012972 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012973 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012974
12975 return out;
12976}
12977
12978
12979PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012980PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000012981{
Thomas Wouters477c8d52006-05-27 19:21:47 +000012982 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012983 int kind1, kind2;
12984 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012985 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012986
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012987 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012988 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012989
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012990 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012991 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012992 len1 = PyUnicode_GET_LENGTH(str_obj);
12993 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020012994 if (kind1 < kind2 || len1 < len2) {
12995 _Py_INCREF_UNICODE_EMPTY();
12996 if (!unicode_empty)
12997 out = NULL;
12998 else {
12999 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13000 Py_DECREF(unicode_empty);
13001 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013002 return out;
13003 }
13004 buf1 = PyUnicode_DATA(str_obj);
13005 buf2 = PyUnicode_DATA(sep_obj);
13006 if (kind2 != kind1) {
13007 buf2 = _PyUnicode_AsKind(sep_obj, kind1);
13008 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013009 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013010 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013011
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013012 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013013 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013014 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13015 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13016 else
13017 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013018 break;
13019 case PyUnicode_2BYTE_KIND:
13020 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13021 break;
13022 case PyUnicode_4BYTE_KIND:
13023 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13024 break;
13025 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013026 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013027 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013028
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013029 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013030 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013031
13032 return out;
13033}
13034
INADA Naoki3ae20562017-01-16 20:41:20 +090013035/*[clinic input]
13036str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013037
INADA Naoki3ae20562017-01-16 20:41:20 +090013038 sep: object
13039 /
13040
13041Partition the string into three parts using the given separator.
13042
13043This will search for the separator in the string. If the separator is found,
13044returns a 3-tuple containing the part before the separator, the separator
13045itself, and the part after it.
13046
13047If the separator is not found, returns a 3-tuple containing the original string
13048and two empty strings.
13049[clinic start generated code]*/
13050
13051static PyObject *
13052unicode_partition(PyObject *self, PyObject *sep)
13053/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013054{
INADA Naoki3ae20562017-01-16 20:41:20 +090013055 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013056}
13057
INADA Naoki3ae20562017-01-16 20:41:20 +090013058/*[clinic input]
13059str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060
INADA Naoki3ae20562017-01-16 20:41:20 +090013061Partition the string into three parts using the given separator.
13062
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013063This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013064the separator is found, returns a 3-tuple containing the part before the
13065separator, the separator itself, and the part after it.
13066
13067If the separator is not found, returns a 3-tuple containing two empty strings
13068and the original string.
13069[clinic start generated code]*/
13070
13071static PyObject *
13072unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013073/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013074{
INADA Naoki3ae20562017-01-16 20:41:20 +090013075 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013076}
13077
Alexander Belopolsky40018472011-02-26 01:02:56 +000013078PyObject *
13079PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013080{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013081 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013082 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013083
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013084 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013085}
13086
INADA Naoki3ae20562017-01-16 20:41:20 +090013087/*[clinic input]
13088str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013089
INADA Naoki3ae20562017-01-16 20:41:20 +090013090Return a list of the words in the string, using sep as the delimiter string.
13091
13092Splits are done starting at the end of the string and working to the front.
13093[clinic start generated code]*/
13094
13095static PyObject *
13096unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13097/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013098{
INADA Naoki3ae20562017-01-16 20:41:20 +090013099 if (sep == Py_None)
13100 return rsplit(self, NULL, maxsplit);
13101 if (PyUnicode_Check(sep))
13102 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013103
Victor Stinner998b8062018-09-12 00:23:25 +020013104 PyErr_Format(PyExc_TypeError,
13105 "must be str or None, not %.100s",
13106 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013107 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013108}
13109
INADA Naoki3ae20562017-01-16 20:41:20 +090013110/*[clinic input]
13111str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013112
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013113 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013114
13115Return a list of the lines in the string, breaking at line boundaries.
13116
13117Line breaks are not included in the resulting list unless keepends is given and
13118true.
13119[clinic start generated code]*/
13120
13121static PyObject *
13122unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013123/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013124{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013125 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013126}
13127
13128static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013129PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013130{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013131 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013132}
13133
INADA Naoki3ae20562017-01-16 20:41:20 +090013134/*[clinic input]
13135str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013136
INADA Naoki3ae20562017-01-16 20:41:20 +090013137Convert uppercase characters to lowercase and lowercase characters to uppercase.
13138[clinic start generated code]*/
13139
13140static PyObject *
13141unicode_swapcase_impl(PyObject *self)
13142/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013143{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013144 if (PyUnicode_READY(self) == -1)
13145 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013146 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013147}
13148
Larry Hastings61272b72014-01-07 12:41:53 -080013149/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013150
Larry Hastings31826802013-10-19 00:09:25 -070013151@staticmethod
13152str.maketrans as unicode_maketrans
13153
13154 x: object
13155
13156 y: unicode=NULL
13157
13158 z: unicode=NULL
13159
13160 /
13161
13162Return a translation table usable for str.translate().
13163
13164If there is only one argument, it must be a dictionary mapping Unicode
13165ordinals (integers) or characters to Unicode ordinals, strings or None.
13166Character keys will be then converted to ordinals.
13167If there are two arguments, they must be strings of equal length, and
13168in the resulting dictionary, each character in x will be mapped to the
13169character at the same position in y. If there is a third argument, it
13170must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013171[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013172
Larry Hastings31826802013-10-19 00:09:25 -070013173static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013174unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013175/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013176{
Georg Brandlceee0772007-11-27 23:48:05 +000013177 PyObject *new = NULL, *key, *value;
13178 Py_ssize_t i = 0;
13179 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013180
Georg Brandlceee0772007-11-27 23:48:05 +000013181 new = PyDict_New();
13182 if (!new)
13183 return NULL;
13184 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013185 int x_kind, y_kind, z_kind;
13186 void *x_data, *y_data, *z_data;
13187
Georg Brandlceee0772007-11-27 23:48:05 +000013188 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013189 if (!PyUnicode_Check(x)) {
13190 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13191 "be a string if there is a second argument");
13192 goto err;
13193 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013194 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013195 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13196 "arguments must have equal length");
13197 goto err;
13198 }
13199 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013200 x_kind = PyUnicode_KIND(x);
13201 y_kind = PyUnicode_KIND(y);
13202 x_data = PyUnicode_DATA(x);
13203 y_data = PyUnicode_DATA(y);
13204 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13205 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013206 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013207 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013208 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013209 if (!value) {
13210 Py_DECREF(key);
13211 goto err;
13212 }
Georg Brandlceee0772007-11-27 23:48:05 +000013213 res = PyDict_SetItem(new, key, value);
13214 Py_DECREF(key);
13215 Py_DECREF(value);
13216 if (res < 0)
13217 goto err;
13218 }
13219 /* create entries for deleting chars in z */
13220 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013221 z_kind = PyUnicode_KIND(z);
13222 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013223 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013224 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013225 if (!key)
13226 goto err;
13227 res = PyDict_SetItem(new, key, Py_None);
13228 Py_DECREF(key);
13229 if (res < 0)
13230 goto err;
13231 }
13232 }
13233 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013234 int kind;
13235 void *data;
13236
Georg Brandlceee0772007-11-27 23:48:05 +000013237 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013238 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013239 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13240 "to maketrans it must be a dict");
13241 goto err;
13242 }
13243 /* copy entries into the new dict, converting string keys to int keys */
13244 while (PyDict_Next(x, &i, &key, &value)) {
13245 if (PyUnicode_Check(key)) {
13246 /* convert string keys to integer keys */
13247 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013248 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013249 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13250 "table must be of length 1");
13251 goto err;
13252 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013253 kind = PyUnicode_KIND(key);
13254 data = PyUnicode_DATA(key);
13255 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013256 if (!newkey)
13257 goto err;
13258 res = PyDict_SetItem(new, newkey, value);
13259 Py_DECREF(newkey);
13260 if (res < 0)
13261 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013262 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013263 /* just keep integer keys */
13264 if (PyDict_SetItem(new, key, value) < 0)
13265 goto err;
13266 } else {
13267 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13268 "be strings or integers");
13269 goto err;
13270 }
13271 }
13272 }
13273 return new;
13274 err:
13275 Py_DECREF(new);
13276 return NULL;
13277}
13278
INADA Naoki3ae20562017-01-16 20:41:20 +090013279/*[clinic input]
13280str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281
INADA Naoki3ae20562017-01-16 20:41:20 +090013282 table: object
13283 Translation table, which must be a mapping of Unicode ordinals to
13284 Unicode ordinals, strings, or None.
13285 /
13286
13287Replace each character in the string using the given translation table.
13288
13289The table must implement lookup/indexing via __getitem__, for instance a
13290dictionary or list. If this operation raises LookupError, the character is
13291left untouched. Characters mapped to None are deleted.
13292[clinic start generated code]*/
13293
13294static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013295unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013296/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013297{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013298 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013299}
13300
INADA Naoki3ae20562017-01-16 20:41:20 +090013301/*[clinic input]
13302str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013303
INADA Naoki3ae20562017-01-16 20:41:20 +090013304Return a copy of the string converted to uppercase.
13305[clinic start generated code]*/
13306
13307static PyObject *
13308unicode_upper_impl(PyObject *self)
13309/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013310{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013311 if (PyUnicode_READY(self) == -1)
13312 return NULL;
13313 if (PyUnicode_IS_ASCII(self))
13314 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013315 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013316}
13317
INADA Naoki3ae20562017-01-16 20:41:20 +090013318/*[clinic input]
13319str.zfill as unicode_zfill
13320
13321 width: Py_ssize_t
13322 /
13323
13324Pad a numeric string with zeros on the left, to fill a field of the given width.
13325
13326The string is never truncated.
13327[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013328
13329static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013330unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013331/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013332{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013333 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013334 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013335 int kind;
13336 void *data;
13337 Py_UCS4 chr;
13338
Benjamin Petersonbac79492012-01-14 13:34:47 -050013339 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013340 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013341
Victor Stinnerc4b49542011-12-11 22:44:26 +010013342 if (PyUnicode_GET_LENGTH(self) >= width)
13343 return unicode_result_unchanged(self);
13344
13345 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013346
13347 u = pad(self, fill, 0, '0');
13348
Walter Dörwald068325e2002-04-15 13:36:47 +000013349 if (u == NULL)
13350 return NULL;
13351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013352 kind = PyUnicode_KIND(u);
13353 data = PyUnicode_DATA(u);
13354 chr = PyUnicode_READ(kind, data, fill);
13355
13356 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013357 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013358 PyUnicode_WRITE(kind, data, 0, chr);
13359 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013360 }
13361
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013362 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013363 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013364}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013365
#if 0
/* Compiled out.  Wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13373
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013374PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013375 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013376\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013377Return True if S starts with the specified prefix, False otherwise.\n\
13378With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013379With optional end, stop comparing S at that position.\n\
13380prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013381
13382static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013383unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013384 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013385{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013386 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013387 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013388 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013389 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013390 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013391
Jesus Ceaac451502011-04-20 17:09:23 +020013392 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013393 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013394 if (PyTuple_Check(subobj)) {
13395 Py_ssize_t i;
13396 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013397 substring = PyTuple_GET_ITEM(subobj, i);
13398 if (!PyUnicode_Check(substring)) {
13399 PyErr_Format(PyExc_TypeError,
13400 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013401 "not %.100s",
13402 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013403 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013404 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013405 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013406 if (result == -1)
13407 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013408 if (result) {
13409 Py_RETURN_TRUE;
13410 }
13411 }
13412 /* nothing matched */
13413 Py_RETURN_FALSE;
13414 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013415 if (!PyUnicode_Check(subobj)) {
13416 PyErr_Format(PyExc_TypeError,
13417 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013418 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013419 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013420 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013421 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013422 if (result == -1)
13423 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013424 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013425}
13426
13427
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013428PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013429 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013430\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013431Return True if S ends with the specified suffix, False otherwise.\n\
13432With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013433With optional end, stop comparing S at that position.\n\
13434suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013435
13436static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013437unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013438 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013439{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013440 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013441 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013442 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013443 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013444 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013445
Jesus Ceaac451502011-04-20 17:09:23 +020013446 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013447 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013448 if (PyTuple_Check(subobj)) {
13449 Py_ssize_t i;
13450 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013451 substring = PyTuple_GET_ITEM(subobj, i);
13452 if (!PyUnicode_Check(substring)) {
13453 PyErr_Format(PyExc_TypeError,
13454 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013455 "not %.100s",
13456 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013457 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013458 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013459 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013460 if (result == -1)
13461 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013462 if (result) {
13463 Py_RETURN_TRUE;
13464 }
13465 }
13466 Py_RETURN_FALSE;
13467 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013468 if (!PyUnicode_Check(subobj)) {
13469 PyErr_Format(PyExc_TypeError,
13470 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013471 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013472 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013473 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013474 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013475 if (result == -1)
13476 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013477 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013478}
13479
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013480static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013481_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013482{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013483 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13484 writer->data = PyUnicode_DATA(writer->buffer);
13485
13486 if (!writer->readonly) {
13487 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013488 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013489 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013490 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013491 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13492 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13493 writer->kind = PyUnicode_WCHAR_KIND;
13494 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13495
Victor Stinner8f674cc2013-04-17 23:02:17 +020013496 /* Copy-on-write mode: set buffer size to 0 so
13497 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13498 * next write. */
13499 writer->size = 0;
13500 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013501}
13502
Victor Stinnerd3f08822012-05-29 12:57:52 +020013503void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013504_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013505{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013506 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013507
13508 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013509 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013510
13511 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13512 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13513 writer->kind = PyUnicode_WCHAR_KIND;
13514 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013515}
13516
Victor Stinnerd3f08822012-05-29 12:57:52 +020013517int
13518_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13519 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013520{
13521 Py_ssize_t newlen;
13522 PyObject *newbuffer;
13523
Victor Stinner2740e462016-09-06 16:58:36 -070013524 assert(maxchar <= MAX_UNICODE);
13525
Victor Stinnerca9381e2015-09-22 00:58:32 +020013526 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013527 assert((maxchar > writer->maxchar && length >= 0)
13528 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013529
Victor Stinner202fdca2012-05-07 12:47:02 +020013530 if (length > PY_SSIZE_T_MAX - writer->pos) {
13531 PyErr_NoMemory();
13532 return -1;
13533 }
13534 newlen = writer->pos + length;
13535
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013536 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013537
Victor Stinnerd3f08822012-05-29 12:57:52 +020013538 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013539 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013540 if (writer->overallocate
13541 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13542 /* overallocate to limit the number of realloc() */
13543 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013544 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013545 if (newlen < writer->min_length)
13546 newlen = writer->min_length;
13547
Victor Stinnerd3f08822012-05-29 12:57:52 +020013548 writer->buffer = PyUnicode_New(newlen, maxchar);
13549 if (writer->buffer == NULL)
13550 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013551 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013552 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013553 if (writer->overallocate
13554 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13555 /* overallocate to limit the number of realloc() */
13556 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013557 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013558 if (newlen < writer->min_length)
13559 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013560
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013561 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013562 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013563 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013564 newbuffer = PyUnicode_New(newlen, maxchar);
13565 if (newbuffer == NULL)
13566 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013567 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13568 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013569 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013570 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013571 }
13572 else {
13573 newbuffer = resize_compact(writer->buffer, newlen);
13574 if (newbuffer == NULL)
13575 return -1;
13576 }
13577 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013578 }
13579 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013580 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013581 newbuffer = PyUnicode_New(writer->size, maxchar);
13582 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013583 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013584 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13585 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013586 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013587 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013588 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013589 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013590
13591#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013592}
13593
Victor Stinnerca9381e2015-09-22 00:58:32 +020013594int
13595_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13596 enum PyUnicode_Kind kind)
13597{
13598 Py_UCS4 maxchar;
13599
13600 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13601 assert(writer->kind < kind);
13602
13603 switch (kind)
13604 {
13605 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13606 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13607 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13608 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013609 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013610 }
13611
13612 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13613}
13614
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013615static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013616_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013617{
Victor Stinner2740e462016-09-06 16:58:36 -070013618 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013619 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13620 return -1;
13621 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13622 writer->pos++;
13623 return 0;
13624}
13625
13626int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013627_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13628{
13629 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13630}
13631
13632int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013633_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13634{
13635 Py_UCS4 maxchar;
13636 Py_ssize_t len;
13637
13638 if (PyUnicode_READY(str) == -1)
13639 return -1;
13640 len = PyUnicode_GET_LENGTH(str);
13641 if (len == 0)
13642 return 0;
13643 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13644 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013645 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013646 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013647 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013648 Py_INCREF(str);
13649 writer->buffer = str;
13650 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013651 writer->pos += len;
13652 return 0;
13653 }
13654 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13655 return -1;
13656 }
13657 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13658 str, 0, len);
13659 writer->pos += len;
13660 return 0;
13661}
13662
Victor Stinnere215d962012-10-06 23:03:36 +020013663int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013664_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13665 Py_ssize_t start, Py_ssize_t end)
13666{
13667 Py_UCS4 maxchar;
13668 Py_ssize_t len;
13669
13670 if (PyUnicode_READY(str) == -1)
13671 return -1;
13672
13673 assert(0 <= start);
13674 assert(end <= PyUnicode_GET_LENGTH(str));
13675 assert(start <= end);
13676
13677 if (end == 0)
13678 return 0;
13679
13680 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13681 return _PyUnicodeWriter_WriteStr(writer, str);
13682
13683 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13684 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13685 else
13686 maxchar = writer->maxchar;
13687 len = end - start;
13688
13689 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13690 return -1;
13691
13692 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13693 str, start, len);
13694 writer->pos += len;
13695 return 0;
13696}
13697
13698int
Victor Stinner4a587072013-11-19 12:54:53 +010013699_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13700 const char *ascii, Py_ssize_t len)
13701{
13702 if (len == -1)
13703 len = strlen(ascii);
13704
13705 assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13706
13707 if (writer->buffer == NULL && !writer->overallocate) {
13708 PyObject *str;
13709
13710 str = _PyUnicode_FromASCII(ascii, len);
13711 if (str == NULL)
13712 return -1;
13713
13714 writer->readonly = 1;
13715 writer->buffer = str;
13716 _PyUnicodeWriter_Update(writer);
13717 writer->pos += len;
13718 return 0;
13719 }
13720
13721 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13722 return -1;
13723
13724 switch (writer->kind)
13725 {
13726 case PyUnicode_1BYTE_KIND:
13727 {
13728 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13729 Py_UCS1 *data = writer->data;
13730
Christian Heimesf051e432016-09-13 20:22:02 +020013731 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013732 break;
13733 }
13734 case PyUnicode_2BYTE_KIND:
13735 {
13736 _PyUnicode_CONVERT_BYTES(
13737 Py_UCS1, Py_UCS2,
13738 ascii, ascii + len,
13739 (Py_UCS2 *)writer->data + writer->pos);
13740 break;
13741 }
13742 case PyUnicode_4BYTE_KIND:
13743 {
13744 _PyUnicode_CONVERT_BYTES(
13745 Py_UCS1, Py_UCS4,
13746 ascii, ascii + len,
13747 (Py_UCS4 *)writer->data + writer->pos);
13748 break;
13749 }
13750 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013751 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013752 }
13753
13754 writer->pos += len;
13755 return 0;
13756}
13757
13758int
13759_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13760 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013761{
13762 Py_UCS4 maxchar;
13763
13764 maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13765 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13766 return -1;
13767 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13768 writer->pos += len;
13769 return 0;
13770}
13771
Victor Stinnerd3f08822012-05-29 12:57:52 +020013772PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013773_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013774{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013775 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013776
Victor Stinnerd3f08822012-05-29 12:57:52 +020013777 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013778 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013779 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013780 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013781
13782 str = writer->buffer;
13783 writer->buffer = NULL;
13784
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013785 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013786 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13787 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013788 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013789
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013790 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13791 PyObject *str2;
13792 str2 = resize_compact(str, writer->pos);
13793 if (str2 == NULL) {
13794 Py_DECREF(str);
13795 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013796 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013797 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013798 }
13799
Victor Stinner15a0bd32013-07-08 22:29:55 +020013800 assert(_PyUnicode_CheckConsistency(str, 1));
13801 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013802}
13803
Victor Stinnerd3f08822012-05-29 12:57:52 +020013804void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013805_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013806{
13807 Py_CLEAR(writer->buffer);
13808}
13809
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013810#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013811
13812PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013813 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013814\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013815Return a formatted version of S, using substitutions from args and kwargs.\n\
13816The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013817
Eric Smith27bbca62010-11-04 17:06:58 +000013818PyDoc_STRVAR(format_map__doc__,
13819 "S.format_map(mapping) -> str\n\
13820\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013821Return a formatted version of S, using substitutions from mapping.\n\
13822The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013823
INADA Naoki3ae20562017-01-16 20:41:20 +090013824/*[clinic input]
13825str.__format__ as unicode___format__
13826
13827 format_spec: unicode
13828 /
13829
13830Return a formatted version of the string as described by format_spec.
13831[clinic start generated code]*/
13832
Eric Smith4a7d76d2008-05-30 18:10:19 +000013833static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013834unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013835/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013836{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013837 _PyUnicodeWriter writer;
13838 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013839
Victor Stinnerd3f08822012-05-29 12:57:52 +020013840 if (PyUnicode_READY(self) == -1)
13841 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013842 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013843 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13844 self, format_spec, 0,
13845 PyUnicode_GET_LENGTH(format_spec));
13846 if (ret == -1) {
13847 _PyUnicodeWriter_Dealloc(&writer);
13848 return NULL;
13849 }
13850 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013851}
13852
INADA Naoki3ae20562017-01-16 20:41:20 +090013853/*[clinic input]
13854str.__sizeof__ as unicode_sizeof
13855
13856Return the size of the string in memory, in bytes.
13857[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000013858
13859static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013860unicode_sizeof_impl(PyObject *self)
13861/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013862{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013863 Py_ssize_t size;
13864
13865 /* If it's a compact object, account for base structure +
13866 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013867 if (PyUnicode_IS_COMPACT_ASCII(self))
13868 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
13869 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013870 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090013871 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013872 else {
13873 /* If it is a two-block object, account for base object, and
13874 for character block if present. */
13875 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090013876 if (_PyUnicode_DATA_ANY(self))
13877 size += (PyUnicode_GET_LENGTH(self) + 1) *
13878 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013879 }
13880 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020013881 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090013882 if (_PyUnicode_HAS_WSTR_MEMORY(self))
13883 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
13884 if (_PyUnicode_HAS_UTF8_MEMORY(self))
13885 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013886
13887 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013888}
13889
Georg Brandlc28e1fa2008-06-10 19:20:26 +000013890static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013891unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013892{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010013893 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013894 if (!copy)
13895 return NULL;
13896 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000013897}
13898
Guido van Rossumd57fd912000-03-10 22:53:23 +000013899static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090013900 UNICODE_ENCODE_METHODDEF
13901 UNICODE_REPLACE_METHODDEF
13902 UNICODE_SPLIT_METHODDEF
13903 UNICODE_RSPLIT_METHODDEF
13904 UNICODE_JOIN_METHODDEF
13905 UNICODE_CAPITALIZE_METHODDEF
13906 UNICODE_CASEFOLD_METHODDEF
13907 UNICODE_TITLE_METHODDEF
13908 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013909 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013910 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013911 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013912 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013913 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013914 UNICODE_LJUST_METHODDEF
13915 UNICODE_LOWER_METHODDEF
13916 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013917 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13918 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013919 UNICODE_RJUST_METHODDEF
13920 UNICODE_RSTRIP_METHODDEF
13921 UNICODE_RPARTITION_METHODDEF
13922 UNICODE_SPLITLINES_METHODDEF
13923 UNICODE_STRIP_METHODDEF
13924 UNICODE_SWAPCASE_METHODDEF
13925 UNICODE_TRANSLATE_METHODDEF
13926 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000013927 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13928 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090013929 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013930 UNICODE_ISLOWER_METHODDEF
13931 UNICODE_ISUPPER_METHODDEF
13932 UNICODE_ISTITLE_METHODDEF
13933 UNICODE_ISSPACE_METHODDEF
13934 UNICODE_ISDECIMAL_METHODDEF
13935 UNICODE_ISDIGIT_METHODDEF
13936 UNICODE_ISNUMERIC_METHODDEF
13937 UNICODE_ISALPHA_METHODDEF
13938 UNICODE_ISALNUM_METHODDEF
13939 UNICODE_ISIDENTIFIER_METHODDEF
13940 UNICODE_ISPRINTABLE_METHODDEF
13941 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020013942 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000013943 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090013944 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070013945 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090013946 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000013947#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013948 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000013949 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013950#endif
13951
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053013952 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000013953 {NULL, NULL}
13954};
13955
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013956static PyObject *
13957unicode_mod(PyObject *v, PyObject *w)
13958{
Brian Curtindfc80e32011-08-10 20:28:54 -050013959 if (!PyUnicode_Check(v))
13960 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013962}
13963
13964static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013965 0, /*nb_add*/
13966 0, /*nb_subtract*/
13967 0, /*nb_multiply*/
13968 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000013969};
13970
Guido van Rossumd57fd912000-03-10 22:53:23 +000013971static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000013972 (lenfunc) unicode_length, /* sq_length */
13973 PyUnicode_Concat, /* sq_concat */
13974 (ssizeargfunc) unicode_repeat, /* sq_repeat */
13975 (ssizeargfunc) unicode_getitem, /* sq_item */
13976 0, /* sq_slice */
13977 0, /* sq_ass_item */
13978 0, /* sq_ass_slice */
13979 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000013980};
13981
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013982static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013983unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013984{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013985 if (PyUnicode_READY(self) == -1)
13986 return NULL;
13987
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000013988 if (PyIndex_Check(item)) {
13989 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013990 if (i == -1 && PyErr_Occurred())
13991 return NULL;
13992 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013993 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013994 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000013995 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060013996 Py_ssize_t start, stop, step, slicelength, i;
13997 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020013998 PyObject *result;
13999 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014000 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014001 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014002
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014003 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014004 return NULL;
14005 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014006 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14007 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014008
14009 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014010 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014011 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014012 slicelength == PyUnicode_GET_LENGTH(self)) {
14013 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014014 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014015 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014016 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014017 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014018 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014019 src_kind = PyUnicode_KIND(self);
14020 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014021 if (!PyUnicode_IS_ASCII(self)) {
14022 kind_limit = kind_maxchar_limit(src_kind);
14023 max_char = 0;
14024 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14025 ch = PyUnicode_READ(src_kind, src_data, cur);
14026 if (ch > max_char) {
14027 max_char = ch;
14028 if (max_char >= kind_limit)
14029 break;
14030 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014031 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014032 }
Victor Stinner55c99112011-10-13 01:17:06 +020014033 else
14034 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014035 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014036 if (result == NULL)
14037 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014038 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014039 dest_data = PyUnicode_DATA(result);
14040
14041 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014042 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14043 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014044 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014045 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014046 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014047 } else {
14048 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14049 return NULL;
14050 }
14051}
14052
14053static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014054 (lenfunc)unicode_length, /* mp_length */
14055 (binaryfunc)unicode_subscript, /* mp_subscript */
14056 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014057};
14058
Guido van Rossumd57fd912000-03-10 22:53:23 +000014059
Guido van Rossumd57fd912000-03-10 22:53:23 +000014060/* Helpers for PyUnicode_Format() */
14061
Victor Stinnera47082312012-10-04 02:19:54 +020014062struct unicode_formatter_t {
14063 PyObject *args;
14064 int args_owned;
14065 Py_ssize_t arglen, argidx;
14066 PyObject *dict;
14067
14068 enum PyUnicode_Kind fmtkind;
14069 Py_ssize_t fmtcnt, fmtpos;
14070 void *fmtdata;
14071 PyObject *fmtstr;
14072
14073 _PyUnicodeWriter writer;
14074};
14075
14076struct unicode_format_arg_t {
14077 Py_UCS4 ch;
14078 int flags;
14079 Py_ssize_t width;
14080 int prec;
14081 int sign;
14082};
14083
Guido van Rossumd57fd912000-03-10 22:53:23 +000014084static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014085unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014086{
Victor Stinnera47082312012-10-04 02:19:54 +020014087 Py_ssize_t argidx = ctx->argidx;
14088
14089 if (argidx < ctx->arglen) {
14090 ctx->argidx++;
14091 if (ctx->arglen < 0)
14092 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014093 else
Victor Stinnera47082312012-10-04 02:19:54 +020014094 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014095 }
14096 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014097 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098 return NULL;
14099}
14100
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014101/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014102
Victor Stinnera47082312012-10-04 02:19:54 +020014103/* Format a float into the writer if the writer is not NULL, or into *p_output
14104 otherwise.
14105
14106 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014107static int
Victor Stinnera47082312012-10-04 02:19:54 +020014108formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14109 PyObject **p_output,
14110 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014111{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014112 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014113 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014114 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014115 int prec;
14116 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014117
Guido van Rossumd57fd912000-03-10 22:53:23 +000014118 x = PyFloat_AsDouble(v);
14119 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014120 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014121
Victor Stinnera47082312012-10-04 02:19:54 +020014122 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014123 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014124 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014125
Victor Stinnera47082312012-10-04 02:19:54 +020014126 if (arg->flags & F_ALT)
14127 dtoa_flags = Py_DTSF_ALT;
14128 else
14129 dtoa_flags = 0;
14130 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014131 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014132 return -1;
14133 len = strlen(p);
14134 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014135 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014136 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014137 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014138 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014139 }
14140 else
14141 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014142 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014143 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014144}
14145
Victor Stinnerd0880d52012-04-27 23:40:13 +020014146/* formatlong() emulates the format codes d, u, o, x and X, and
14147 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14148 * Python's regular ints.
14149 * Return value: a new PyUnicodeObject*, or NULL if error.
14150 * The output string is of the form
14151 * "-"? ("0x" | "0X")? digit+
14152 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14153 * set in flags. The case of hex digits will be correct,
14154 * There will be at least prec digits, zero-filled on the left if
14155 * necessary to get that many.
14156 * val object to be converted
14157 * flags bitmask of format flags; only F_ALT is looked at
14158 * prec minimum number of digits; 0-fill on left if needed
14159 * type a character in [duoxX]; u acts the same as d
14160 *
14161 * CAUTION: o, x and X conversions on regular ints can never
14162 * produce a '-' sign, but can for Python's unbounded ints.
14163 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014164PyObject *
14165_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014166{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014167 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014168 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014169 Py_ssize_t i;
14170 int sign; /* 1 if '-', else 0 */
14171 int len; /* number of characters */
14172 Py_ssize_t llen;
14173 int numdigits; /* len == numnondigits + numdigits */
14174 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014175
Victor Stinnerd0880d52012-04-27 23:40:13 +020014176 /* Avoid exceeding SSIZE_T_MAX */
14177 if (prec > INT_MAX-3) {
14178 PyErr_SetString(PyExc_OverflowError,
14179 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014180 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014181 }
14182
14183 assert(PyLong_Check(val));
14184
14185 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014186 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014187 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014188 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014189 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014190 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014191 /* int and int subclasses should print numerically when a numeric */
14192 /* format code is used (see issue18780) */
14193 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014194 break;
14195 case 'o':
14196 numnondigits = 2;
14197 result = PyNumber_ToBase(val, 8);
14198 break;
14199 case 'x':
14200 case 'X':
14201 numnondigits = 2;
14202 result = PyNumber_ToBase(val, 16);
14203 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014204 }
14205 if (!result)
14206 return NULL;
14207
14208 assert(unicode_modifiable(result));
14209 assert(PyUnicode_IS_READY(result));
14210 assert(PyUnicode_IS_ASCII(result));
14211
14212 /* To modify the string in-place, there can only be one reference. */
14213 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014214 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014215 PyErr_BadInternalCall();
14216 return NULL;
14217 }
14218 buf = PyUnicode_DATA(result);
14219 llen = PyUnicode_GET_LENGTH(result);
14220 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014221 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014222 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014223 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014224 return NULL;
14225 }
14226 len = (int)llen;
14227 sign = buf[0] == '-';
14228 numnondigits += sign;
14229 numdigits = len - numnondigits;
14230 assert(numdigits > 0);
14231
14232 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014233 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014234 (type == 'o' || type == 'x' || type == 'X'))) {
14235 assert(buf[sign] == '0');
14236 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14237 buf[sign+1] == 'o');
14238 numnondigits -= 2;
14239 buf += 2;
14240 len -= 2;
14241 if (sign)
14242 buf[0] = '-';
14243 assert(len == numnondigits + numdigits);
14244 assert(numdigits > 0);
14245 }
14246
14247 /* Fill with leading zeroes to meet minimum width. */
14248 if (prec > numdigits) {
14249 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14250 numnondigits + prec);
14251 char *b1;
14252 if (!r1) {
14253 Py_DECREF(result);
14254 return NULL;
14255 }
14256 b1 = PyBytes_AS_STRING(r1);
14257 for (i = 0; i < numnondigits; ++i)
14258 *b1++ = *buf++;
14259 for (i = 0; i < prec - numdigits; i++)
14260 *b1++ = '0';
14261 for (i = 0; i < numdigits; i++)
14262 *b1++ = *buf++;
14263 *b1 = '\0';
14264 Py_DECREF(result);
14265 result = r1;
14266 buf = PyBytes_AS_STRING(result);
14267 len = numnondigits + prec;
14268 }
14269
14270 /* Fix up case for hex conversions. */
14271 if (type == 'X') {
14272 /* Need to convert all lower case letters to upper case.
14273 and need to convert 0x to 0X (and -0x to -0X). */
14274 for (i = 0; i < len; i++)
14275 if (buf[i] >= 'a' && buf[i] <= 'x')
14276 buf[i] -= 'a'-'A';
14277 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014278 if (!PyUnicode_Check(result)
14279 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014280 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014281 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014282 Py_DECREF(result);
14283 result = unicode;
14284 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014285 else if (len != PyUnicode_GET_LENGTH(result)) {
14286 if (PyUnicode_Resize(&result, len) < 0)
14287 Py_CLEAR(result);
14288 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014289 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014290}
14291
Ethan Furmandf3ed242014-01-05 06:50:30 -080014292/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014293 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014294 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014295 * -1 and raise an exception on error */
14296static int
Victor Stinnera47082312012-10-04 02:19:54 +020014297mainformatlong(PyObject *v,
14298 struct unicode_format_arg_t *arg,
14299 PyObject **p_output,
14300 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014301{
14302 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014303 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014304
14305 if (!PyNumber_Check(v))
14306 goto wrongtype;
14307
Ethan Furman9ab74802014-03-21 06:38:46 -070014308 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014309 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014310 if (type == 'o' || type == 'x' || type == 'X') {
14311 iobj = PyNumber_Index(v);
14312 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014313 if (PyErr_ExceptionMatches(PyExc_TypeError))
14314 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014315 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014316 }
14317 }
14318 else {
14319 iobj = PyNumber_Long(v);
14320 if (iobj == NULL ) {
14321 if (PyErr_ExceptionMatches(PyExc_TypeError))
14322 goto wrongtype;
14323 return -1;
14324 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014325 }
14326 assert(PyLong_Check(iobj));
14327 }
14328 else {
14329 iobj = v;
14330 Py_INCREF(iobj);
14331 }
14332
14333 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014334 && arg->width == -1 && arg->prec == -1
14335 && !(arg->flags & (F_SIGN | F_BLANK))
14336 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014337 {
14338 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014339 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014340 int base;
14341
Victor Stinnera47082312012-10-04 02:19:54 +020014342 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014343 {
14344 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014345 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014346 case 'd':
14347 case 'i':
14348 case 'u':
14349 base = 10;
14350 break;
14351 case 'o':
14352 base = 8;
14353 break;
14354 case 'x':
14355 case 'X':
14356 base = 16;
14357 break;
14358 }
14359
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014360 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14361 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014362 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014363 }
14364 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014365 return 1;
14366 }
14367
Ethan Furmanb95b5612015-01-23 20:05:18 -080014368 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014369 Py_DECREF(iobj);
14370 if (res == NULL)
14371 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014372 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014373 return 0;
14374
14375wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014376 switch(type)
14377 {
14378 case 'o':
14379 case 'x':
14380 case 'X':
14381 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014382 "%%%c format: an integer is required, "
14383 "not %.200s",
14384 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014385 break;
14386 default:
14387 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014388 "%%%c format: a number is required, "
14389 "not %.200s",
14390 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014391 break;
14392 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014393 return -1;
14394}
14395
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014396static Py_UCS4
14397formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014398{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014399 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014400 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014401 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014402 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014403 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014404 goto onError;
14405 }
14406 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014407 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014408 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014409 /* make sure number is a type of integer */
14410 if (!PyLong_Check(v)) {
14411 iobj = PyNumber_Index(v);
14412 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014413 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014414 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014415 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014416 Py_DECREF(iobj);
14417 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014418 else {
14419 x = PyLong_AsLong(v);
14420 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014421 if (x == -1 && PyErr_Occurred())
14422 goto onError;
14423
Victor Stinner8faf8212011-12-08 22:14:11 +010014424 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014425 PyErr_SetString(PyExc_OverflowError,
14426 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014427 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014428 }
14429
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014430 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014431 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014432
Benjamin Peterson29060642009-01-31 22:14:21 +000014433 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014434 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014435 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014436 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014437}
14438
Victor Stinnera47082312012-10-04 02:19:54 +020014439/* Parse options of an argument: flags, width, precision.
14440 Handle also "%(name)" syntax.
14441
14442 Return 0 if the argument has been formatted into arg->str.
14443 Return 1 if the argument has been written into ctx->writer,
14444 Raise an exception and return -1 on error. */
14445static int
14446unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14447 struct unicode_format_arg_t *arg)
14448{
14449#define FORMAT_READ(ctx) \
14450 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14451
14452 PyObject *v;
14453
Victor Stinnera47082312012-10-04 02:19:54 +020014454 if (arg->ch == '(') {
14455 /* Get argument value from a dictionary. Example: "%(name)s". */
14456 Py_ssize_t keystart;
14457 Py_ssize_t keylen;
14458 PyObject *key;
14459 int pcount = 1;
14460
14461 if (ctx->dict == NULL) {
14462 PyErr_SetString(PyExc_TypeError,
14463 "format requires a mapping");
14464 return -1;
14465 }
14466 ++ctx->fmtpos;
14467 --ctx->fmtcnt;
14468 keystart = ctx->fmtpos;
14469 /* Skip over balanced parentheses */
14470 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14471 arg->ch = FORMAT_READ(ctx);
14472 if (arg->ch == ')')
14473 --pcount;
14474 else if (arg->ch == '(')
14475 ++pcount;
14476 ctx->fmtpos++;
14477 }
14478 keylen = ctx->fmtpos - keystart - 1;
14479 if (ctx->fmtcnt < 0 || pcount > 0) {
14480 PyErr_SetString(PyExc_ValueError,
14481 "incomplete format key");
14482 return -1;
14483 }
14484 key = PyUnicode_Substring(ctx->fmtstr,
14485 keystart, keystart + keylen);
14486 if (key == NULL)
14487 return -1;
14488 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014489 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014490 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014491 }
14492 ctx->args = PyObject_GetItem(ctx->dict, key);
14493 Py_DECREF(key);
14494 if (ctx->args == NULL)
14495 return -1;
14496 ctx->args_owned = 1;
14497 ctx->arglen = -1;
14498 ctx->argidx = -2;
14499 }
14500
14501 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014502 while (--ctx->fmtcnt >= 0) {
14503 arg->ch = FORMAT_READ(ctx);
14504 ctx->fmtpos++;
14505 switch (arg->ch) {
14506 case '-': arg->flags |= F_LJUST; continue;
14507 case '+': arg->flags |= F_SIGN; continue;
14508 case ' ': arg->flags |= F_BLANK; continue;
14509 case '#': arg->flags |= F_ALT; continue;
14510 case '0': arg->flags |= F_ZERO; continue;
14511 }
14512 break;
14513 }
14514
14515 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014516 if (arg->ch == '*') {
14517 v = unicode_format_getnextarg(ctx);
14518 if (v == NULL)
14519 return -1;
14520 if (!PyLong_Check(v)) {
14521 PyErr_SetString(PyExc_TypeError,
14522 "* wants int");
14523 return -1;
14524 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014525 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014526 if (arg->width == -1 && PyErr_Occurred())
14527 return -1;
14528 if (arg->width < 0) {
14529 arg->flags |= F_LJUST;
14530 arg->width = -arg->width;
14531 }
14532 if (--ctx->fmtcnt >= 0) {
14533 arg->ch = FORMAT_READ(ctx);
14534 ctx->fmtpos++;
14535 }
14536 }
14537 else if (arg->ch >= '0' && arg->ch <= '9') {
14538 arg->width = arg->ch - '0';
14539 while (--ctx->fmtcnt >= 0) {
14540 arg->ch = FORMAT_READ(ctx);
14541 ctx->fmtpos++;
14542 if (arg->ch < '0' || arg->ch > '9')
14543 break;
14544 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14545 mixing signed and unsigned comparison. Since arg->ch is between
14546 '0' and '9', casting to int is safe. */
14547 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14548 PyErr_SetString(PyExc_ValueError,
14549 "width too big");
14550 return -1;
14551 }
14552 arg->width = arg->width*10 + (arg->ch - '0');
14553 }
14554 }
14555
14556 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014557 if (arg->ch == '.') {
14558 arg->prec = 0;
14559 if (--ctx->fmtcnt >= 0) {
14560 arg->ch = FORMAT_READ(ctx);
14561 ctx->fmtpos++;
14562 }
14563 if (arg->ch == '*') {
14564 v = unicode_format_getnextarg(ctx);
14565 if (v == NULL)
14566 return -1;
14567 if (!PyLong_Check(v)) {
14568 PyErr_SetString(PyExc_TypeError,
14569 "* wants int");
14570 return -1;
14571 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014572 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014573 if (arg->prec == -1 && PyErr_Occurred())
14574 return -1;
14575 if (arg->prec < 0)
14576 arg->prec = 0;
14577 if (--ctx->fmtcnt >= 0) {
14578 arg->ch = FORMAT_READ(ctx);
14579 ctx->fmtpos++;
14580 }
14581 }
14582 else if (arg->ch >= '0' && arg->ch <= '9') {
14583 arg->prec = arg->ch - '0';
14584 while (--ctx->fmtcnt >= 0) {
14585 arg->ch = FORMAT_READ(ctx);
14586 ctx->fmtpos++;
14587 if (arg->ch < '0' || arg->ch > '9')
14588 break;
14589 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14590 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014591 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014592 return -1;
14593 }
14594 arg->prec = arg->prec*10 + (arg->ch - '0');
14595 }
14596 }
14597 }
14598
14599 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14600 if (ctx->fmtcnt >= 0) {
14601 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14602 if (--ctx->fmtcnt >= 0) {
14603 arg->ch = FORMAT_READ(ctx);
14604 ctx->fmtpos++;
14605 }
14606 }
14607 }
14608 if (ctx->fmtcnt < 0) {
14609 PyErr_SetString(PyExc_ValueError,
14610 "incomplete format");
14611 return -1;
14612 }
14613 return 0;
14614
14615#undef FORMAT_READ
14616}
14617
14618/* Format one argument. Supported conversion specifiers:
14619
14620 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014621 - "i", "d", "u": int or float
14622 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014623 - "e", "E", "f", "F", "g", "G": float
14624 - "c": int or str (1 character)
14625
Victor Stinner8dbd4212012-12-04 09:30:24 +010014626 When possible, the output is written directly into the Unicode writer
14627 (ctx->writer). A string is created when padding is required.
14628
Victor Stinnera47082312012-10-04 02:19:54 +020014629 Return 0 if the argument has been formatted into *p_str,
14630 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014631 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014632static int
14633unicode_format_arg_format(struct unicode_formatter_t *ctx,
14634 struct unicode_format_arg_t *arg,
14635 PyObject **p_str)
14636{
14637 PyObject *v;
14638 _PyUnicodeWriter *writer = &ctx->writer;
14639
14640 if (ctx->fmtcnt == 0)
14641 ctx->writer.overallocate = 0;
14642
Victor Stinnera47082312012-10-04 02:19:54 +020014643 v = unicode_format_getnextarg(ctx);
14644 if (v == NULL)
14645 return -1;
14646
Victor Stinnera47082312012-10-04 02:19:54 +020014647
14648 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014649 case 's':
14650 case 'r':
14651 case 'a':
14652 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14653 /* Fast path */
14654 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14655 return -1;
14656 return 1;
14657 }
14658
14659 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14660 *p_str = v;
14661 Py_INCREF(*p_str);
14662 }
14663 else {
14664 if (arg->ch == 's')
14665 *p_str = PyObject_Str(v);
14666 else if (arg->ch == 'r')
14667 *p_str = PyObject_Repr(v);
14668 else
14669 *p_str = PyObject_ASCII(v);
14670 }
14671 break;
14672
14673 case 'i':
14674 case 'd':
14675 case 'u':
14676 case 'o':
14677 case 'x':
14678 case 'X':
14679 {
14680 int ret = mainformatlong(v, arg, p_str, writer);
14681 if (ret != 0)
14682 return ret;
14683 arg->sign = 1;
14684 break;
14685 }
14686
14687 case 'e':
14688 case 'E':
14689 case 'f':
14690 case 'F':
14691 case 'g':
14692 case 'G':
14693 if (arg->width == -1 && arg->prec == -1
14694 && !(arg->flags & (F_SIGN | F_BLANK)))
14695 {
14696 /* Fast path */
14697 if (formatfloat(v, arg, NULL, writer) == -1)
14698 return -1;
14699 return 1;
14700 }
14701
14702 arg->sign = 1;
14703 if (formatfloat(v, arg, p_str, NULL) == -1)
14704 return -1;
14705 break;
14706
14707 case 'c':
14708 {
14709 Py_UCS4 ch = formatchar(v);
14710 if (ch == (Py_UCS4) -1)
14711 return -1;
14712 if (arg->width == -1 && arg->prec == -1) {
14713 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014714 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014715 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014716 return 1;
14717 }
14718 *p_str = PyUnicode_FromOrdinal(ch);
14719 break;
14720 }
14721
14722 default:
14723 PyErr_Format(PyExc_ValueError,
14724 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014725 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014726 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14727 (int)arg->ch,
14728 ctx->fmtpos - 1);
14729 return -1;
14730 }
14731 if (*p_str == NULL)
14732 return -1;
14733 assert (PyUnicode_Check(*p_str));
14734 return 0;
14735}
14736
14737static int
14738unicode_format_arg_output(struct unicode_formatter_t *ctx,
14739 struct unicode_format_arg_t *arg,
14740 PyObject *str)
14741{
14742 Py_ssize_t len;
14743 enum PyUnicode_Kind kind;
14744 void *pbuf;
14745 Py_ssize_t pindex;
14746 Py_UCS4 signchar;
14747 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014748 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014749 Py_ssize_t sublen;
14750 _PyUnicodeWriter *writer = &ctx->writer;
14751 Py_UCS4 fill;
14752
14753 fill = ' ';
14754 if (arg->sign && arg->flags & F_ZERO)
14755 fill = '0';
14756
14757 if (PyUnicode_READY(str) == -1)
14758 return -1;
14759
14760 len = PyUnicode_GET_LENGTH(str);
14761 if ((arg->width == -1 || arg->width <= len)
14762 && (arg->prec == -1 || arg->prec >= len)
14763 && !(arg->flags & (F_SIGN | F_BLANK)))
14764 {
14765 /* Fast path */
14766 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14767 return -1;
14768 return 0;
14769 }
14770
14771 /* Truncate the string for "s", "r" and "a" formats
14772 if the precision is set */
14773 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14774 if (arg->prec >= 0 && len > arg->prec)
14775 len = arg->prec;
14776 }
14777
14778 /* Adjust sign and width */
14779 kind = PyUnicode_KIND(str);
14780 pbuf = PyUnicode_DATA(str);
14781 pindex = 0;
14782 signchar = '\0';
14783 if (arg->sign) {
14784 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14785 if (ch == '-' || ch == '+') {
14786 signchar = ch;
14787 len--;
14788 pindex++;
14789 }
14790 else if (arg->flags & F_SIGN)
14791 signchar = '+';
14792 else if (arg->flags & F_BLANK)
14793 signchar = ' ';
14794 else
14795 arg->sign = 0;
14796 }
14797 if (arg->width < len)
14798 arg->width = len;
14799
14800 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014801 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014802 if (!(arg->flags & F_LJUST)) {
14803 if (arg->sign) {
14804 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014805 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014806 }
14807 else {
14808 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014809 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014810 }
14811 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014812 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14813 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014814 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014815 }
14816
Victor Stinnera47082312012-10-04 02:19:54 +020014817 buflen = arg->width;
14818 if (arg->sign && len == arg->width)
14819 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014820 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014821 return -1;
14822
14823 /* Write the sign if needed */
14824 if (arg->sign) {
14825 if (fill != ' ') {
14826 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14827 writer->pos += 1;
14828 }
14829 if (arg->width > len)
14830 arg->width--;
14831 }
14832
14833 /* Write the numeric prefix for "x", "X" and "o" formats
14834 if the alternate form is used.
14835 For example, write "0x" for the "%#x" format. */
14836 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14837 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14838 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14839 if (fill != ' ') {
14840 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14841 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14842 writer->pos += 2;
14843 pindex += 2;
14844 }
14845 arg->width -= 2;
14846 if (arg->width < 0)
14847 arg->width = 0;
14848 len -= 2;
14849 }
14850
14851 /* Pad left with the fill character if needed */
14852 if (arg->width > len && !(arg->flags & F_LJUST)) {
14853 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014854 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014855 writer->pos += sublen;
14856 arg->width = len;
14857 }
14858
14859 /* If padding with spaces: write sign if needed and/or numeric prefix if
14860 the alternate form is used */
14861 if (fill == ' ') {
14862 if (arg->sign) {
14863 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14864 writer->pos += 1;
14865 }
14866 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14867 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14868 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14869 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14870 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14871 writer->pos += 2;
14872 pindex += 2;
14873 }
14874 }
14875
14876 /* Write characters */
14877 if (len) {
14878 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14879 str, pindex, len);
14880 writer->pos += len;
14881 }
14882
14883 /* Pad right with the fill character if needed */
14884 if (arg->width > len) {
14885 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010014886 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020014887 writer->pos += sublen;
14888 }
14889 return 0;
14890}
14891
14892/* Helper of PyUnicode_Format(): format one arg.
14893 Return 0 on success, raise an exception and return -1 on error. */
14894static int
14895unicode_format_arg(struct unicode_formatter_t *ctx)
14896{
14897 struct unicode_format_arg_t arg;
14898 PyObject *str;
14899 int ret;
14900
Victor Stinner8dbd4212012-12-04 09:30:24 +010014901 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014902 if (arg.ch == '%') {
14903 ctx->fmtpos++;
14904 ctx->fmtcnt--;
14905 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
14906 return -1;
14907 return 0;
14908 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010014909 arg.flags = 0;
14910 arg.width = -1;
14911 arg.prec = -1;
14912 arg.sign = 0;
14913 str = NULL;
14914
Victor Stinnera47082312012-10-04 02:19:54 +020014915 ret = unicode_format_arg_parse(ctx, &arg);
14916 if (ret == -1)
14917 return -1;
14918
14919 ret = unicode_format_arg_format(ctx, &arg, &str);
14920 if (ret == -1)
14921 return -1;
14922
14923 if (ret != 1) {
14924 ret = unicode_format_arg_output(ctx, &arg, str);
14925 Py_DECREF(str);
14926 if (ret == -1)
14927 return -1;
14928 }
14929
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020014930 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014931 PyErr_SetString(PyExc_TypeError,
14932 "not all arguments converted during string formatting");
14933 return -1;
14934 }
14935 return 0;
14936}
14937
Alexander Belopolsky40018472011-02-26 01:02:56 +000014938PyObject *
14939PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014940{
Victor Stinnera47082312012-10-04 02:19:54 +020014941 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000014942
Guido van Rossumd57fd912000-03-10 22:53:23 +000014943 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014944 PyErr_BadInternalCall();
14945 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014946 }
Victor Stinnera47082312012-10-04 02:19:54 +020014947
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014948 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014949 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030014950
14951 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020014952 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14953 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14954 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14955 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014956
Victor Stinner8f674cc2013-04-17 23:02:17 +020014957 _PyUnicodeWriter_Init(&ctx.writer);
14958 ctx.writer.min_length = ctx.fmtcnt + 100;
14959 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020014960
Guido van Rossumd57fd912000-03-10 22:53:23 +000014961 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020014962 ctx.arglen = PyTuple_Size(args);
14963 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014964 }
14965 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014966 ctx.arglen = -1;
14967 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014968 }
Victor Stinnera47082312012-10-04 02:19:54 +020014969 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040014970 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020014971 ctx.dict = args;
14972 else
14973 ctx.dict = NULL;
14974 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014975
Victor Stinnera47082312012-10-04 02:19:54 +020014976 while (--ctx.fmtcnt >= 0) {
14977 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020014978 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020014979
14980 nonfmtpos = ctx.fmtpos++;
14981 while (ctx.fmtcnt >= 0 &&
14982 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14983 ctx.fmtpos++;
14984 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014985 }
Victor Stinnera47082312012-10-04 02:19:54 +020014986 if (ctx.fmtcnt < 0) {
14987 ctx.fmtpos--;
14988 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020014989 }
Victor Stinneree4544c2012-05-09 22:24:08 +020014990
Victor Stinnercfc4c132013-04-03 01:48:39 +020014991 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14992 nonfmtpos, ctx.fmtpos) < 0)
14993 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014994 }
14995 else {
Victor Stinnera47082312012-10-04 02:19:54 +020014996 ctx.fmtpos++;
14997 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000014998 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020014999 }
15000 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015001
Victor Stinnera47082312012-10-04 02:19:54 +020015002 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015003 PyErr_SetString(PyExc_TypeError,
15004 "not all arguments converted during string formatting");
15005 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015006 }
15007
Victor Stinnera47082312012-10-04 02:19:54 +020015008 if (ctx.args_owned) {
15009 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015010 }
Victor Stinnera47082312012-10-04 02:19:54 +020015011 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015012
Benjamin Peterson29060642009-01-31 22:14:21 +000015013 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015014 _PyUnicodeWriter_Dealloc(&ctx.writer);
15015 if (ctx.args_owned) {
15016 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015017 }
15018 return NULL;
15019}
15020
Jeremy Hylton938ace62002-07-17 16:30:39 +000015021static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015022unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15023
Tim Peters6d6c1a32001-08-02 04:15:00 +000015024static PyObject *
15025unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15026{
Benjamin Peterson29060642009-01-31 22:14:21 +000015027 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015028 static char *kwlist[] = {"object", "encoding", "errors", 0};
15029 char *encoding = NULL;
15030 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015031
Benjamin Peterson14339b62009-01-31 16:36:08 +000015032 if (type != &PyUnicode_Type)
15033 return unicode_subtype_new(type, args, kwds);
15034 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015035 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015036 return NULL;
15037 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015038 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015039 if (encoding == NULL && errors == NULL)
15040 return PyObject_Str(x);
15041 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015042 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015043}
15044
Guido van Rossume023fe02001-08-30 03:12:59 +000015045static PyObject *
15046unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15047{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015048 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015049 Py_ssize_t length, char_size;
15050 int share_wstr, share_utf8;
15051 unsigned int kind;
15052 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015053
Benjamin Peterson14339b62009-01-31 16:36:08 +000015054 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015055
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015056 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015057 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015058 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015059 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015060 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015061 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015062 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015063 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015064
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015065 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015066 if (self == NULL) {
15067 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015068 return NULL;
15069 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015070 kind = PyUnicode_KIND(unicode);
15071 length = PyUnicode_GET_LENGTH(unicode);
15072
15073 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015074#ifdef Py_DEBUG
15075 _PyUnicode_HASH(self) = -1;
15076#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015077 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015078#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015079 _PyUnicode_STATE(self).interned = 0;
15080 _PyUnicode_STATE(self).kind = kind;
15081 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015082 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015083 _PyUnicode_STATE(self).ready = 1;
15084 _PyUnicode_WSTR(self) = NULL;
15085 _PyUnicode_UTF8_LENGTH(self) = 0;
15086 _PyUnicode_UTF8(self) = NULL;
15087 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015088 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015089
15090 share_utf8 = 0;
15091 share_wstr = 0;
15092 if (kind == PyUnicode_1BYTE_KIND) {
15093 char_size = 1;
15094 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15095 share_utf8 = 1;
15096 }
15097 else if (kind == PyUnicode_2BYTE_KIND) {
15098 char_size = 2;
15099 if (sizeof(wchar_t) == 2)
15100 share_wstr = 1;
15101 }
15102 else {
15103 assert(kind == PyUnicode_4BYTE_KIND);
15104 char_size = 4;
15105 if (sizeof(wchar_t) == 4)
15106 share_wstr = 1;
15107 }
15108
15109 /* Ensure we won't overflow the length. */
15110 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15111 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015112 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015113 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015114 data = PyObject_MALLOC((length + 1) * char_size);
15115 if (data == NULL) {
15116 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015117 goto onError;
15118 }
15119
Victor Stinnerc3c74152011-10-02 20:39:55 +020015120 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015121 if (share_utf8) {
15122 _PyUnicode_UTF8_LENGTH(self) = length;
15123 _PyUnicode_UTF8(self) = data;
15124 }
15125 if (share_wstr) {
15126 _PyUnicode_WSTR_LENGTH(self) = length;
15127 _PyUnicode_WSTR(self) = (wchar_t *)data;
15128 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015129
Christian Heimesf051e432016-09-13 20:22:02 +020015130 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015131 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015132 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015133#ifdef Py_DEBUG
15134 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15135#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015136 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015137 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015138
15139onError:
15140 Py_DECREF(unicode);
15141 Py_DECREF(self);
15142 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015143}
15144
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015145PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015146"str(object='') -> str\n\
15147str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015148\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015149Create a new string object from the given object. If encoding or\n\
15150errors is specified, then the object must expose a data buffer\n\
15151that will be decoded using the given encoding and error handler.\n\
15152Otherwise, returns the result of object.__str__() (if defined)\n\
15153or repr(object).\n\
15154encoding defaults to sys.getdefaultencoding().\n\
15155errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015156
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015157static PyObject *unicode_iter(PyObject *seq);
15158
Guido van Rossumd57fd912000-03-10 22:53:23 +000015159PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015160 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015161 "str", /* tp_name */
15162 sizeof(PyUnicodeObject), /* tp_basicsize */
15163 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015164 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015165 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015166 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015167 0, /* tp_getattr */
15168 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015169 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015170 unicode_repr, /* tp_repr */
15171 &unicode_as_number, /* tp_as_number */
15172 &unicode_as_sequence, /* tp_as_sequence */
15173 &unicode_as_mapping, /* tp_as_mapping */
15174 (hashfunc) unicode_hash, /* tp_hash*/
15175 0, /* tp_call*/
15176 (reprfunc) unicode_str, /* tp_str */
15177 PyObject_GenericGetAttr, /* tp_getattro */
15178 0, /* tp_setattro */
15179 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015181 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15182 unicode_doc, /* tp_doc */
15183 0, /* tp_traverse */
15184 0, /* tp_clear */
15185 PyUnicode_RichCompare, /* tp_richcompare */
15186 0, /* tp_weaklistoffset */
15187 unicode_iter, /* tp_iter */
15188 0, /* tp_iternext */
15189 unicode_methods, /* tp_methods */
15190 0, /* tp_members */
15191 0, /* tp_getset */
15192 &PyBaseObject_Type, /* tp_base */
15193 0, /* tp_dict */
15194 0, /* tp_descr_get */
15195 0, /* tp_descr_set */
15196 0, /* tp_dictoffset */
15197 0, /* tp_init */
15198 0, /* tp_alloc */
15199 unicode_new, /* tp_new */
15200 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015201};
15202
15203/* Initialize the Unicode implementation */
15204
Victor Stinner331a6a52019-05-27 16:39:22 +020015205PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015206_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015207{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015208 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015209 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015210 0x000A, /* LINE FEED */
15211 0x000D, /* CARRIAGE RETURN */
15212 0x001C, /* FILE SEPARATOR */
15213 0x001D, /* GROUP SEPARATOR */
15214 0x001E, /* RECORD SEPARATOR */
15215 0x0085, /* NEXT LINE */
15216 0x2028, /* LINE SEPARATOR */
15217 0x2029, /* PARAGRAPH SEPARATOR */
15218 };
15219
Fred Drakee4315f52000-05-09 19:53:39 +000015220 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015221 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015222 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015223 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015224 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015225 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015226
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015227 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015228 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015229 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015230
15231 /* initialize the linebreak bloom filter */
15232 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015233 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015234 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015235
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015236 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015237 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015238 }
15239 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015240 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015241 }
15242 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015243 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015244 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015245 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015246}
15247
15248/* Finalize the Unicode implementation */
15249
/* Kept as a callable entry point; this implementation releases nothing
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int result = 0;

    return result;
}
15255
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015256
Walter Dörwald16807132007-05-25 13:52:07 +000015257void
15258PyUnicode_InternInPlace(PyObject **p)
15259{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015260 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015262#ifdef Py_DEBUG
15263 assert(s != NULL);
15264 assert(_PyUnicode_CHECK(s));
15265#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015266 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015267 return;
15268#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015269 /* If it's a subclass, we don't really know what putting
15270 it in the interned dict might do. */
15271 if (!PyUnicode_CheckExact(s))
15272 return;
15273 if (PyUnicode_CHECK_INTERNED(s))
15274 return;
15275 if (interned == NULL) {
15276 interned = PyDict_New();
15277 if (interned == NULL) {
15278 PyErr_Clear(); /* Don't leave an exception */
15279 return;
15280 }
15281 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015282 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015283 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015284 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015285 if (t == NULL) {
15286 PyErr_Clear();
15287 return;
15288 }
15289 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015290 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015291 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015292 return;
15293 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015294 /* The two references in interned are not counted by refcnt.
15295 The deallocator will take care of this */
15296 Py_REFCNT(s) -= 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015297 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015298}
15299
15300void
15301PyUnicode_InternImmortal(PyObject **p)
15302{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015303 PyUnicode_InternInPlace(p);
15304 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015305 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015306 Py_INCREF(*p);
15307 }
Walter Dörwald16807132007-05-25 13:52:07 +000015308}
15309
15310PyObject *
15311PyUnicode_InternFromString(const char *cp)
15312{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015313 PyObject *s = PyUnicode_FromString(cp);
15314 if (s == NULL)
15315 return NULL;
15316 PyUnicode_InternInPlace(&s);
15317 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015318}
15319
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015320
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Undo the interning bookkeeping for every key of the interned dict and
   then drop the dict.  Only compiled for memory-debugging builds. */
static void
unicode_release_interned(void)
{
    PyObject *key_list;
    Py_ssize_t count, idx;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    key_list = PyDict_Keys(interned);
    if (key_list == NULL || !PyList_Check(key_list)) {
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    count = PyList_GET_SIZE(key_list);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            count);
#endif
    for (idx = 0; idx < count; idx++) {
        PyObject *str = PyList_GET_ITEM(key_list, idx);
        int state;

        if (PyUnicode_READY(str) == -1) {
            Py_UNREACHABLE();
        }
        state = PyUnicode_CHECK_INTERNED(str);
        if (state == SSTATE_INTERNED_IMMORTAL) {
            /* Give back the single uncounted reference. */
            Py_REFCNT(str) += 1;
            immortal_size += PyUnicode_GET_LENGTH(str);
        }
        else if (state == SSTATE_INTERNED_MORTAL) {
            /* Give back both uncounted references. */
            Py_REFCNT(str) += 2;
            mortal_size += PyUnicode_GET_LENGTH(str);
        }
        else if (state != SSTATE_NOT_INTERNED) {
            Py_FatalError("Inconsistent interned string state.");
        }
        /* else: SSTATE_NOT_INTERNED -- XXX Shouldn't happen */
        _PyUnicode_STATE(str).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(key_list);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015380
15381
15382/********************* Unicode Iterator **************************/
15383
15384typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015385 PyObject_HEAD
15386 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015387 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015388} unicodeiterobject;
15389
15390static void
15391unicodeiter_dealloc(unicodeiterobject *it)
15392{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015393 _PyObject_GC_UNTRACK(it);
15394 Py_XDECREF(it->it_seq);
15395 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015396}
15397
15398static int
15399unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15400{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015401 Py_VISIT(it->it_seq);
15402 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015403}
15404
15405static PyObject *
15406unicodeiter_next(unicodeiterobject *it)
15407{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015408 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015409
Benjamin Peterson14339b62009-01-31 16:36:08 +000015410 assert(it != NULL);
15411 seq = it->it_seq;
15412 if (seq == NULL)
15413 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015414 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015415
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015416 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15417 int kind = PyUnicode_KIND(seq);
15418 void *data = PyUnicode_DATA(seq);
15419 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15420 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015421 if (item != NULL)
15422 ++it->it_index;
15423 return item;
15424 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015425
Benjamin Peterson14339b62009-01-31 16:36:08 +000015426 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015427 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015428 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015429}
15430
15431static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015432unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015433{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 Py_ssize_t len = 0;
15435 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015436 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015437 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015438}
15439
15440PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15441
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015442static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015443unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015444{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015445 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015446 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015447 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015448 it->it_seq, it->it_index);
15449 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015450 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015451 if (u == NULL)
15452 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015453 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015454 }
15455}
15456
15457PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15458
15459static PyObject *
15460unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15461{
15462 Py_ssize_t index = PyLong_AsSsize_t(state);
15463 if (index == -1 && PyErr_Occurred())
15464 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015465 if (it->it_seq != NULL) {
15466 if (index < 0)
15467 index = 0;
15468 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15469 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15470 it->it_index = index;
15471 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015472 Py_RETURN_NONE;
15473}
15474
15475PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15476
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015477static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015478 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015479 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015480 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15481 reduce_doc},
15482 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15483 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015484 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015485};
15486
15487PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015488 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15489 "str_iterator", /* tp_name */
15490 sizeof(unicodeiterobject), /* tp_basicsize */
15491 0, /* tp_itemsize */
15492 /* methods */
15493 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015494 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015495 0, /* tp_getattr */
15496 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015497 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015498 0, /* tp_repr */
15499 0, /* tp_as_number */
15500 0, /* tp_as_sequence */
15501 0, /* tp_as_mapping */
15502 0, /* tp_hash */
15503 0, /* tp_call */
15504 0, /* tp_str */
15505 PyObject_GenericGetAttr, /* tp_getattro */
15506 0, /* tp_setattro */
15507 0, /* tp_as_buffer */
15508 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15509 0, /* tp_doc */
15510 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15511 0, /* tp_clear */
15512 0, /* tp_richcompare */
15513 0, /* tp_weaklistoffset */
15514 PyObject_SelfIter, /* tp_iter */
15515 (iternextfunc)unicodeiter_next, /* tp_iternext */
15516 unicodeiter_methods, /* tp_methods */
15517 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015518};
15519
15520static PyObject *
15521unicode_iter(PyObject *seq)
15522{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015523 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015524
Benjamin Peterson14339b62009-01-31 16:36:08 +000015525 if (!PyUnicode_Check(seq)) {
15526 PyErr_BadInternalCall();
15527 return NULL;
15528 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015529 if (PyUnicode_READY(seq) == -1)
15530 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015531 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15532 if (it == NULL)
15533 return NULL;
15534 it->it_index = 0;
15535 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015536 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015537 _PyObject_GC_TRACK(it);
15538 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015539}
15540
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015541
15542size_t
15543Py_UNICODE_strlen(const Py_UNICODE *u)
15544{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015545 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015546}
15547
15548Py_UNICODE*
15549Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15550{
15551 Py_UNICODE *u = s1;
15552 while ((*u++ = *s2++));
15553 return s1;
15554}
15555
15556Py_UNICODE*
15557Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15558{
15559 Py_UNICODE *u = s1;
15560 while ((*u++ = *s2++))
15561 if (n-- == 0)
15562 break;
15563 return s1;
15564}
15565
15566Py_UNICODE*
15567Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15568{
15569 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015570 u1 += wcslen(u1);
15571 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015572 return s1;
15573}
15574
15575int
15576Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15577{
15578 while (*s1 && *s2 && *s1 == *s2)
15579 s1++, s2++;
15580 if (*s1 && *s2)
15581 return (*s1 < *s2) ? -1 : +1;
15582 if (*s1)
15583 return 1;
15584 if (*s2)
15585 return -1;
15586 return 0;
15587}
15588
15589int
15590Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15591{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015592 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015593 for (; n != 0; n--) {
15594 u1 = *s1;
15595 u2 = *s2;
15596 if (u1 != u2)
15597 return (u1 < u2) ? -1 : +1;
15598 if (u1 == '\0')
15599 return 0;
15600 s1++;
15601 s2++;
15602 }
15603 return 0;
15604}
15605
15606Py_UNICODE*
15607Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15608{
15609 const Py_UNICODE *p;
15610 for (p = s; *p; p++)
15611 if (*p == c)
15612 return (Py_UNICODE*)p;
15613 return NULL;
15614}
15615
15616Py_UNICODE*
15617Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15618{
15619 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015620 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015621 while (p != s) {
15622 p--;
15623 if (*p == c)
15624 return (Py_UNICODE*)p;
15625 }
15626 return NULL;
15627}
Victor Stinner331ea922010-08-10 16:37:20 +000015628
Victor Stinner71133ff2010-09-01 23:43:53 +000015629Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015630PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015631{
Victor Stinner577db2c2011-10-11 22:12:48 +020015632 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015633 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015634
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015635 if (!PyUnicode_Check(unicode)) {
15636 PyErr_BadArgument();
15637 return NULL;
15638 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015639 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015640 if (u == NULL)
15641 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015642 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015643 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015644 PyErr_NoMemory();
15645 return NULL;
15646 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015647 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015648 size *= sizeof(Py_UNICODE);
15649 copy = PyMem_Malloc(size);
15650 if (copy == NULL) {
15651 PyErr_NoMemory();
15652 return NULL;
15653 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015654 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015655 return copy;
15656}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015657
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015658
Victor Stinner709d23d2019-05-02 14:56:30 -040015659static int
15660encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015661{
Victor Stinner709d23d2019-05-02 14:56:30 -040015662 int res;
15663 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15664 if (res == -2) {
15665 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15666 return -1;
15667 }
15668 if (res < 0) {
15669 PyErr_NoMemory();
15670 return -1;
15671 }
15672 return 0;
15673}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015674
Victor Stinner709d23d2019-05-02 14:56:30 -040015675
15676static int
15677config_get_codec_name(wchar_t **config_encoding)
15678{
15679 char *encoding;
15680 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15681 return -1;
15682 }
15683
15684 PyObject *name_obj = NULL;
15685 PyObject *codec = _PyCodec_Lookup(encoding);
15686 PyMem_RawFree(encoding);
15687
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015688 if (!codec)
15689 goto error;
15690
15691 name_obj = PyObject_GetAttrString(codec, "name");
15692 Py_CLEAR(codec);
15693 if (!name_obj) {
15694 goto error;
15695 }
15696
Victor Stinner709d23d2019-05-02 14:56:30 -040015697 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15698 Py_DECREF(name_obj);
15699 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015700 goto error;
15701 }
15702
Victor Stinner709d23d2019-05-02 14:56:30 -040015703 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15704 if (raw_wname == NULL) {
15705 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015706 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015707 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015708 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015709
15710 PyMem_RawFree(*config_encoding);
15711 *config_encoding = raw_wname;
15712
15713 PyMem_Free(wname);
15714 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015715
15716error:
15717 Py_XDECREF(codec);
15718 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015719 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015720}
15721
15722
Victor Stinner331a6a52019-05-27 16:39:22 +020015723static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015724init_stdio_encoding(PyInterpreterState *interp)
15725{
Victor Stinner709d23d2019-05-02 14:56:30 -040015726 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinner331a6a52019-05-27 16:39:22 +020015727 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015728 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015729 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015730 "of the stdio encoding");
15731 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015732 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015733}
15734
15735
Victor Stinner709d23d2019-05-02 14:56:30 -040015736static int
15737init_fs_codec(PyInterpreterState *interp)
15738{
Victor Stinner331a6a52019-05-27 16:39:22 +020015739 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015740
15741 _Py_error_handler error_handler;
15742 error_handler = get_error_handler_wide(config->filesystem_errors);
15743 if (error_handler == _Py_ERROR_UNKNOWN) {
15744 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15745 return -1;
15746 }
15747
15748 char *encoding, *errors;
15749 if (encode_wstr_utf8(config->filesystem_encoding,
15750 &encoding,
15751 "filesystem_encoding") < 0) {
15752 return -1;
15753 }
15754
15755 if (encode_wstr_utf8(config->filesystem_errors,
15756 &errors,
15757 "filesystem_errors") < 0) {
15758 PyMem_RawFree(encoding);
15759 return -1;
15760 }
15761
15762 PyMem_RawFree(interp->fs_codec.encoding);
15763 interp->fs_codec.encoding = encoding;
15764 PyMem_RawFree(interp->fs_codec.errors);
15765 interp->fs_codec.errors = errors;
15766 interp->fs_codec.error_handler = error_handler;
15767
15768 /* At this point, PyUnicode_EncodeFSDefault() and
15769 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15770 the C implementation of the filesystem encoding. */
15771
15772 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15773 global configuration variables. */
15774 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15775 interp->fs_codec.errors) < 0) {
15776 PyErr_NoMemory();
15777 return -1;
15778 }
15779 return 0;
15780}
15781
15782
Victor Stinner331a6a52019-05-27 16:39:22 +020015783static PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015784init_fs_encoding(PyInterpreterState *interp)
15785{
Victor Stinner709d23d2019-05-02 14:56:30 -040015786 /* Update the filesystem encoding to the normalized Python codec name.
15787 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15788 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015789 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015790 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015791 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015792 "of the filesystem encoding");
15793 }
15794
Victor Stinner709d23d2019-05-02 14:56:30 -040015795 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015796 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015797 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015798 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015799}
15800
15801
Victor Stinner331a6a52019-05-27 16:39:22 +020015802PyStatus
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015803_PyUnicode_InitEncodings(PyInterpreterState *interp)
15804{
Victor Stinner331a6a52019-05-27 16:39:22 +020015805 PyStatus status = init_fs_encoding(interp);
15806 if (_PyStatus_EXCEPTION(status)) {
15807 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015808 }
15809
15810 return init_stdio_encoding(interp);
15811}
15812
15813
#ifdef MS_WINDOWS
/* Switch the filesystem encoding of the current interpreter to
   mbcs/replace (PEP 529) and re-initialize the filesystem codec.

   Returns the result of init_fs_codec(), or -1 with MemoryError set if the
   new strings cannot be allocated; in that case the configuration is left
   unchanged. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacements before touching the configuration. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");

    if (!new_encoding || !new_errors) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;
    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
15839
15840
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015841void
15842_PyUnicode_Fini(void)
15843{
15844#if defined(WITH_VALGRIND) || defined(__INSURE__)
15845 /* Insure++ is a memory analysis tool that aids in discovering
15846 * memory leaks and other memory problems. On Python exit, the
15847 * interned string dictionaries are flagged as being in use at exit
15848 * (which it is). Under normal circumstances, this is fine because
15849 * the memory will be automatically reclaimed by the system. Under
15850 * memory debugging, it's a huge source of useless noise, so we
15851 * trade off slower shutdown for less distraction in the memory
15852 * reports. -baw
15853 */
15854 unicode_release_interned();
15855#endif /* __INSURE__ */
15856
15857 Py_CLEAR(unicode_empty);
15858
15859 for (Py_ssize_t i = 0; i < 256; i++) {
15860 Py_CLEAR(unicode_latin1[i]);
15861 }
15862 _PyUnicode_ClearStaticStrings();
15863 (void)PyUnicode_ClearFreeList();
Victor Stinner709d23d2019-05-02 14:56:30 -040015864
15865 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
15866 PyMem_RawFree(interp->fs_codec.encoding);
15867 interp->fs_codec.encoding = NULL;
15868 PyMem_RawFree(interp->fs_codec.errors);
15869 interp->fs_codec.errors = NULL;
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015870}
15871
15872
/* A _string module, to export formatter_parser and formatter_field_name_split
   to the string.Formatter class implemented in Python. */
15875
15876static PyMethodDef _string_methods[] = {
15877 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15878 METH_O, PyDoc_STR("split the argument as a field name")},
15879 {"formatter_parser", (PyCFunction) formatter_parser,
15880 METH_O, PyDoc_STR("parse the argument as a format string")},
15881 {NULL, NULL}
15882};
15883
15884static struct PyModuleDef _string_module = {
15885 PyModuleDef_HEAD_INIT,
15886 "_string",
15887 PyDoc_STR("string helper module"),
15888 0,
15889 _string_methods,
15890 NULL,
15891 NULL,
15892 NULL,
15893 NULL
15894};
15895
15896PyMODINIT_FUNC
15897PyInit__string(void)
15898{
15899 return PyModule_Create(&_string_module);
15900}
15901
15902
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000015903#ifdef __cplusplus
15904}
15905#endif