blob: 1e1f257dad0ff991d5ea038bc0fe4e638696ad94 [file] [log] [blame]
Tim Petersced69f82003-09-16 20:30:58 +00001/*
Guido van Rossumd57fd912000-03-10 22:53:23 +00002
3Unicode implementation based on original code by Fredrik Lundh,
Benjamin Peterson31616ea2011-10-01 00:11:09 -04004modified by Marc-Andre Lemburg <mal@lemburg.com>.
Guido van Rossumd57fd912000-03-10 22:53:23 +00005
Thomas Wouters477c8d52006-05-27 19:21:47 +00006Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
Guido van Rossum16b1ad92000-08-03 16:24:25 +00009Copyright (c) Corporation for National Research Initiatives.
Guido van Rossumd57fd912000-03-10 22:53:23 +000010
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000011--------------------------------------------------------------------
12The original string type implementation is:
Guido van Rossumd57fd912000-03-10 22:53:23 +000013
Benjamin Peterson29060642009-01-31 22:14:21 +000014 Copyright (c) 1999 by Secret Labs AB
15 Copyright (c) 1999 by Fredrik Lundh
Guido van Rossumd57fd912000-03-10 22:53:23 +000016
Fredrik Lundh0fdb90c2001-01-19 09:45:02 +000017By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000040
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000041#define PY_SSIZE_T_CLEAN
Guido van Rossumd57fd912000-03-10 22:53:23 +000042#include "Python.h"
Victor Stinnera15e2602020-04-08 02:01:56 +020043#include "pycore_abstract.h" // _PyIndex_Check()
Victor Stinner45876a92020-02-12 22:32:34 +010044#include "pycore_bytes_methods.h"
Victor Stinner9fc57a32018-11-07 00:44:03 +010045#include "pycore_fileutils.h"
Victor Stinner61691d82019-10-02 23:51:20 +020046#include "pycore_initconfig.h"
Victor Stinnerbcda8f12018-11-21 22:27:47 +010047#include "pycore_object.h"
Victor Stinner61691d82019-10-02 23:51:20 +020048#include "pycore_pathconfig.h"
Victor Stinner43fc3bb2019-05-02 11:54:20 -040049#include "pycore_pylifecycle.h"
Victor Stinner621cebe2018-11-12 16:53:38 +010050#include "pycore_pystate.h"
Marc-André Lemburgd49e5b42000-06-30 14:58:20 +000051#include "ucnhash.h"
Raymond Hettingerac2ef652015-07-04 16:04:44 -070052#include "stringlib/eq.h"
Guido van Rossumd57fd912000-03-10 22:53:23 +000053
Martin v. Löwis6238d2b2002-06-30 15:26:10 +000054#ifdef MS_WINDOWS
Guido van Rossumb7a40ba2000-03-28 02:01:52 +000055#include <windows.h>
56#endif
Guido van Rossumfd4b9572000-04-10 13:51:10 +000057
/* Uncomment to display statistics on interned strings at exit when
   using Valgrind or Insure++. */
60/* #define INTERNED_STATS 1 */
61
62
Larry Hastings61272b72014-01-07 12:41:53 -080063/*[clinic input]
INADA Naoki15f94592017-01-16 21:49:13 +090064class str "PyObject *" "&PyUnicode_Type"
Larry Hastings61272b72014-01-07 12:41:53 -080065[clinic start generated code]*/
INADA Naoki3ae20562017-01-16 20:41:20 +090066/*[clinic end generated code: output=da39a3ee5e6b4b0d input=4884c934de622cf6]*/
67
68/*[python input]
69class Py_UCS4_converter(CConverter):
70 type = 'Py_UCS4'
71 converter = 'convert_uc'
72
73 def converter_init(self):
74 if self.default is not unspecified:
75 self.c_default = ascii(self.default)
76 if len(self.c_default) > 4 or self.c_default[0] != "'":
77 self.c_default = hex(ord(self.default))
78
79[python start generated code]*/
80/*[python end generated code: output=da39a3ee5e6b4b0d input=88f5dd06cd8e7a61]*/
Larry Hastings44e2eaa2013-11-23 15:37:55 -080081
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000082/* --- Globals ------------------------------------------------------------
83
Serhiy Storchaka05997252013-01-26 12:14:02 +020084NOTE: In the interpreter's initialization phase, some globals are currently
85 initialized dynamically as needed. In the process Unicode objects may
86 be created before the Unicode type is ready.
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000087
88*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000089
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000090
91#ifdef __cplusplus
92extern "C" {
93#endif
94
Victor Stinner8faf8212011-12-08 22:14:11 +010095/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
96#define MAX_UNICODE 0x10ffff
97
Victor Stinner910337b2011-10-03 03:20:16 +020098#ifdef Py_DEBUG
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020099# define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
Victor Stinner910337b2011-10-03 03:20:16 +0200100#else
101# define _PyUnicode_CHECK(op) PyUnicode_Check(op)
102#endif
Victor Stinnerfb5f5f22011-09-28 21:39:49 +0200103
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200104#define _PyUnicode_UTF8(op) \
105 (((PyCompactUnicodeObject*)(op))->utf8)
106#define PyUnicode_UTF8(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200107 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200108 assert(PyUnicode_IS_READY(op)), \
109 PyUnicode_IS_COMPACT_ASCII(op) ? \
110 ((char*)((PyASCIIObject*)(op) + 1)) : \
111 _PyUnicode_UTF8(op))
Victor Stinnerbc8b81b2011-09-29 19:31:34 +0200112#define _PyUnicode_UTF8_LENGTH(op) \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200113 (((PyCompactUnicodeObject*)(op))->utf8_length)
114#define PyUnicode_UTF8_LENGTH(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200115 (assert(_PyUnicode_CHECK(op)), \
Victor Stinnere90fe6a2011-10-01 16:48:13 +0200116 assert(PyUnicode_IS_READY(op)), \
117 PyUnicode_IS_COMPACT_ASCII(op) ? \
118 ((PyASCIIObject*)(op))->length : \
119 _PyUnicode_UTF8_LENGTH(op))
Victor Stinnera5f91632011-10-04 01:07:11 +0200120#define _PyUnicode_WSTR(op) \
121 (((PyASCIIObject*)(op))->wstr)
122#define _PyUnicode_WSTR_LENGTH(op) \
123 (((PyCompactUnicodeObject*)(op))->wstr_length)
124#define _PyUnicode_LENGTH(op) \
125 (((PyASCIIObject *)(op))->length)
126#define _PyUnicode_STATE(op) \
127 (((PyASCIIObject *)(op))->state)
128#define _PyUnicode_HASH(op) \
129 (((PyASCIIObject *)(op))->hash)
Victor Stinner910337b2011-10-03 03:20:16 +0200130#define _PyUnicode_KIND(op) \
131 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200132 ((PyASCIIObject *)(op))->state.kind)
Victor Stinner910337b2011-10-03 03:20:16 +0200133#define _PyUnicode_GET_LENGTH(op) \
134 (assert(_PyUnicode_CHECK(op)), \
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200135 ((PyASCIIObject *)(op))->length)
Victor Stinnera5f91632011-10-04 01:07:11 +0200136#define _PyUnicode_DATA_ANY(op) \
137 (((PyUnicodeObject*)(op))->data.any)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200138
Victor Stinner910337b2011-10-03 03:20:16 +0200139#undef PyUnicode_READY
140#define PyUnicode_READY(op) \
141 (assert(_PyUnicode_CHECK(op)), \
142 (PyUnicode_IS_READY(op) ? \
Victor Stinnera5f91632011-10-04 01:07:11 +0200143 0 : \
Victor Stinner7931d9a2011-11-04 00:22:48 +0100144 _PyUnicode_Ready(op)))
Victor Stinner910337b2011-10-03 03:20:16 +0200145
Victor Stinnerc379ead2011-10-03 12:52:27 +0200146#define _PyUnicode_SHARE_UTF8(op) \
147 (assert(_PyUnicode_CHECK(op)), \
148 assert(!PyUnicode_IS_COMPACT_ASCII(op)), \
149 (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of the Unicode object `op` is the very same
   pointer as its character data, i.e. both compare equal.

   The comparison is done on the macro argument itself; the macro does not
   depend on any identifier from the caller's scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
153
Victor Stinner829c0ad2011-10-03 01:08:02 +0200154/* true if the Unicode object has an allocated UTF-8 memory block
155 (not shared with other data) */
Victor Stinner910337b2011-10-03 03:20:16 +0200156#define _PyUnicode_HAS_UTF8_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200157 ((!PyUnicode_IS_COMPACT_ASCII(op) \
Victor Stinner910337b2011-10-03 03:20:16 +0200158 && _PyUnicode_UTF8(op) \
Victor Stinner829c0ad2011-10-03 01:08:02 +0200159 && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
160
Victor Stinner03490912011-10-03 23:45:12 +0200161/* true if the Unicode object has an allocated wstr memory block
162 (not shared with other data) */
163#define _PyUnicode_HAS_WSTR_MEMORY(op) \
Victor Stinnere699e5a2013-07-15 18:22:47 +0200164 ((_PyUnicode_WSTR(op) && \
Victor Stinner03490912011-10-03 23:45:12 +0200165 (!PyUnicode_IS_READY(op) || \
166 _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
167
/* Generic helper macro to convert characters between two integer types.

   from_type, to_type: valid type names of the source and destination
                       characters.
   begin, end:         pointers delimiting the source characters; they are
                       converted to "const from_type *".
   to:                 pointer to the output buffer; it is converted to
                       "to_type *".

   Each source character is cast to to_type and stored in order.  The bulk
   of the input is processed four characters per iteration; the remaining
   (at most three) characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to)    \
    do {                                                                \
        to_type *_to = (to_type *)(to);                                 \
        const from_type *_iter = (const from_type *)(begin);            \
        const from_type *_end = (const from_type *)(end);               \
        Py_ssize_t n = (_end) - (_iter);                                \
        const from_type *_unrolled_end =                                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);                          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) {         \
            _to[0] = (to_type) _iter[0];                                \
            _to[1] = (to_type) _iter[1];                                \
            _to[2] = (to_type) _iter[2];                                \
            _to[3] = (to_type) _iter[3];                                \
        }                                                               \
        for (; _iter < (_end); _iter++, _to++) {                        \
            *_to = (to_type) *_iter;                                    \
        }                                                               \
    } while (0)
Victor Stinner829c0ad2011-10-03 01:08:02 +0200191
Victor Stinnerfdfbf782015-10-09 00:33:49 +0200192#ifdef MS_WINDOWS
193 /* On Windows, overallocate by 50% is the best factor */
194# define OVERALLOCATE_FACTOR 2
195#else
196 /* On Linux, overallocate by 25% is the best factor */
197# define OVERALLOCATE_FACTOR 4
198#endif
199
Walter Dörwald16807132007-05-25 13:52:07 +0000200/* This dictionary holds all interned unicode strings. Note that references
201 to strings in this dictionary are *not* counted in the string's ob_refcnt.
202 When the interned string reaches a refcnt of 0 the string deallocation
203 function will delete the reference from this dictionary.
204
205 Another way to look at this is that to say that the actual reference
Guido van Rossum98297ee2007-11-06 21:34:58 +0000206 count of a string is: s->ob_refcnt + (s->state ? 2 : 0)
Walter Dörwald16807132007-05-25 13:52:07 +0000207*/
Serhiy Storchaka05997252013-01-26 12:14:02 +0200208static PyObject *interned = NULL;
Walter Dörwald16807132007-05-25 13:52:07 +0000209
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000210/* The empty Unicode object is shared to improve performance. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200211static PyObject *unicode_empty = NULL;
Serhiy Storchaka05997252013-01-26 12:14:02 +0200212
/* Take a new reference to the shared empty string, creating the shared
   object first if it does not exist yet.  If the creation fails,
   unicode_empty is left NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return the shared empty string from the enclosing function after taking
   a reference to it via _Py_INCREF_UNICODE_EMPTY().  The returned value is
   whatever unicode_empty holds afterwards (NULL if it could not be
   created). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000231
Victor Stinner59423e32018-11-26 13:40:01 +0100232static inline void
233unicode_fill(enum PyUnicode_Kind kind, void *data, Py_UCS4 value,
234 Py_ssize_t start, Py_ssize_t length)
235{
236 assert(0 <= start);
237 assert(kind != PyUnicode_WCHAR_KIND);
238 switch (kind) {
239 case PyUnicode_1BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100240 assert(value <= 0xff);
Victor Stinner59423e32018-11-26 13:40:01 +0100241 Py_UCS1 ch = (unsigned char)value;
242 Py_UCS1 *to = (Py_UCS1 *)data + start;
243 memset(to, ch, length);
244 break;
245 }
246 case PyUnicode_2BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100247 assert(value <= 0xffff);
Victor Stinner59423e32018-11-26 13:40:01 +0100248 Py_UCS2 ch = (Py_UCS2)value;
249 Py_UCS2 *to = (Py_UCS2 *)data + start;
250 const Py_UCS2 *end = to + length;
251 for (; to < end; ++to) *to = ch;
252 break;
253 }
254 case PyUnicode_4BYTE_KIND: {
Victor Stinner163403a2018-11-27 12:41:17 +0100255 assert(value <= MAX_UNICODE);
Victor Stinner59423e32018-11-26 13:40:01 +0100256 Py_UCS4 ch = value;
257 Py_UCS4 * to = (Py_UCS4 *)data + start;
258 const Py_UCS4 *end = to + length;
259 for (; to < end; ++to) *to = ch;
260 break;
261 }
262 default: Py_UNREACHABLE();
263 }
264}
265
266
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200267/* Forward declaration */
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700268static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200269_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
Inada Naoki770847a2019-06-24 12:30:24 +0900270static inline void
271_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer);
Victor Stinner709d23d2019-05-02 14:56:30 -0400272static PyObject *
273unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
274 const char *errors);
275static PyObject *
276unicode_decode_utf8(const char *s, Py_ssize_t size,
277 _Py_error_handler error_handler, const char *errors,
278 Py_ssize_t *consumed);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +0200279
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200280/* List of static strings. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200281static _Py_Identifier *static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +0200282
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000283/* Single character Unicode strings in the Latin-1 range are being
284 shared as well. */
Serhiy Storchaka678db842013-01-26 12:16:36 +0200285static PyObject *unicode_latin1[256] = {NULL};
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +0000286
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by a 7-bit character code: an entry is 1 when the
   character is whitespace and 0 otherwise.  Entries that are not listed
   below are implicitly zero. */
const unsigned char _Py_ascii_whitespace[128] = {
    [0x09] = 1,     /* CHARACTER TABULATION */
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1,     /* RECORD SEPARATOR */
    [0x1F] = 1,     /* UNIT SEPARATOR */
    [0x20] = 1      /* SPACE */
};
317
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200318/* forward */
Victor Stinnerfe226c02011-10-03 03:52:20 +0200319static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
Victor Stinner1b4f9ce2011-10-03 13:28:14 +0200320static PyObject* get_latin1_char(unsigned char ch);
Victor Stinner488fa492011-12-12 00:01:39 +0100321static int unicode_modifiable(PyObject *unicode);
322
Victor Stinnerfe226c02011-10-03 03:52:20 +0200323
Alexander Belopolsky40018472011-02-26 01:02:56 +0000324static PyObject *
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100325_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200326static PyObject *
327_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
328static PyObject *
329_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
330
331static PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +0000332unicode_encode_call_errorhandler(const char *errors,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000333 PyObject **errorHandler,const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +0100334 PyObject *unicode, PyObject **exceptionObject,
Martin v. Löwisdb12d452009-05-02 18:52:14 +0000335 Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
336
Alexander Belopolsky40018472011-02-26 01:02:56 +0000337static void
338raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +0300339 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +0100340 PyObject *unicode,
341 Py_ssize_t startpos, Py_ssize_t endpos,
342 const char *reason);
Victor Stinner31be90b2010-04-22 19:38:16 +0000343
/* Fast detection of line break characters.

   Lookup table indexed by a 7-bit character code: an entry is 1 when the
   character is a line break and 0 otherwise.  Entries that are not listed
   below are implicitly zero. */
static const unsigned char ascii_linebreak[128] = {
    [0x0A] = 1,     /* LINE FEED */
    [0x0B] = 1,     /* LINE TABULATION */
    [0x0C] = 1,     /* FORM FEED */
    [0x0D] = 1,     /* CARRIAGE RETURN */
    [0x1C] = 1,     /* FILE SEPARATOR */
    [0x1D] = 1,     /* GROUP SEPARATOR */
    [0x1E] = 1      /* RECORD SEPARATOR */
};
371
INADA Naoki3ae20562017-01-16 20:41:20 +0900372static int convert_uc(PyObject *obj, void *addr);
373
Serhiy Storchaka1009bf12015-04-03 23:53:51 +0300374#include "clinic/unicodeobject.c.h"
375
Victor Stinner3d4226a2018-08-29 22:21:32 +0200376_Py_error_handler
377_Py_GetErrorHandler(const char *errors)
Victor Stinner50149202015-09-22 00:26:54 +0200378{
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200379 if (errors == NULL || strcmp(errors, "strict") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200380 return _Py_ERROR_STRICT;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200381 }
382 if (strcmp(errors, "surrogateescape") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200383 return _Py_ERROR_SURROGATEESCAPE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200384 }
385 if (strcmp(errors, "replace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200386 return _Py_ERROR_REPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200387 }
388 if (strcmp(errors, "ignore") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200389 return _Py_ERROR_IGNORE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200390 }
391 if (strcmp(errors, "backslashreplace") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200392 return _Py_ERROR_BACKSLASHREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200393 }
394 if (strcmp(errors, "surrogatepass") == 0) {
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200395 return _Py_ERROR_SURROGATEPASS;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200396 }
397 if (strcmp(errors, "xmlcharrefreplace") == 0) {
Victor Stinner50149202015-09-22 00:26:54 +0200398 return _Py_ERROR_XMLCHARREFREPLACE;
Victor Stinner1a05d6c2016-09-02 12:12:23 +0200399 }
Victor Stinner50149202015-09-22 00:26:54 +0200400 return _Py_ERROR_OTHER;
401}
402
Victor Stinner709d23d2019-05-02 14:56:30 -0400403
404static _Py_error_handler
405get_error_handler_wide(const wchar_t *errors)
406{
407 if (errors == NULL || wcscmp(errors, L"strict") == 0) {
408 return _Py_ERROR_STRICT;
409 }
410 if (wcscmp(errors, L"surrogateescape") == 0) {
411 return _Py_ERROR_SURROGATEESCAPE;
412 }
413 if (wcscmp(errors, L"replace") == 0) {
414 return _Py_ERROR_REPLACE;
415 }
416 if (wcscmp(errors, L"ignore") == 0) {
417 return _Py_ERROR_IGNORE;
418 }
419 if (wcscmp(errors, L"backslashreplace") == 0) {
420 return _Py_ERROR_BACKSLASHREPLACE;
421 }
422 if (wcscmp(errors, L"surrogatepass") == 0) {
423 return _Py_ERROR_SURROGATEPASS;
424 }
425 if (wcscmp(errors, L"xmlcharrefreplace") == 0) {
426 return _Py_ERROR_XMLCHARREFREPLACE;
427 }
428 return _Py_ERROR_OTHER;
429}
430
431
Victor Stinner22eb6892019-06-26 00:51:05 +0200432static inline int
433unicode_check_encoding_errors(const char *encoding, const char *errors)
434{
435 if (encoding == NULL && errors == NULL) {
436 return 0;
437 }
438
439 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
440#ifndef Py_DEBUG
441 /* In release mode, only check in development mode (-X dev) */
442 if (!interp->config.dev_mode) {
443 return 0;
444 }
445#else
446 /* Always check in debug mode */
447#endif
448
449 /* Avoid calling _PyCodec_Lookup() and PyCodec_LookupError() before the
450 codec registry is ready: before_PyUnicode_InitEncodings() is called. */
451 if (!interp->fs_codec.encoding) {
452 return 0;
453 }
454
Victor Stinnerd8acf0d2020-04-07 16:07:42 +0200455 /* Disable checks during Python finalization. For example, it allows to
456 call _PyObject_Dump() during finalization for debugging purpose. */
457 if (interp->finalizing) {
458 return 0;
459 }
460
Victor Stinner22eb6892019-06-26 00:51:05 +0200461 if (encoding != NULL) {
462 PyObject *handler = _PyCodec_Lookup(encoding);
463 if (handler == NULL) {
464 return -1;
465 }
466 Py_DECREF(handler);
467 }
468
469 if (errors != NULL) {
470 PyObject *handler = PyCodec_LookupError(errors);
471 if (handler == NULL) {
472 return -1;
473 }
474 Py_DECREF(handler);
475 }
476 return 0;
477}
478
479
Ezio Melotti48a2f8f2011-09-29 00:18:19 +0300480/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
481 This function is kept for backward compatibility with the old API. */
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000482Py_UNICODE
Marc-André Lemburg6c6bfb72001-07-20 17:39:11 +0000483PyUnicode_GetMax(void)
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000484{
Fredrik Lundh8f455852001-06-27 18:59:43 +0000485#ifdef Py_UNICODE_WIDE
Benjamin Peterson14339b62009-01-31 16:36:08 +0000486 return 0x10FFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000487#else
Benjamin Peterson14339b62009-01-31 16:36:08 +0000488 /* This is actually an illegal character, so it should
489 not be passed to unichr. */
490 return 0xFFFF;
Martin v. Löwisce9b5a52001-06-27 06:28:56 +0000491#endif
492}
493
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +0200494int
Victor Stinner7931d9a2011-11-04 00:22:48 +0100495_PyUnicode_CheckConsistency(PyObject *op, int check_content)
Victor Stinner910337b2011-10-03 03:20:16 +0200496{
Victor Stinner68762572019-10-07 18:42:01 +0200497#define CHECK(expr) \
498 do { if (!(expr)) { _PyObject_ASSERT_FAILED_MSG(op, Py_STRINGIFY(expr)); } } while (0)
499
Victor Stinner910337b2011-10-03 03:20:16 +0200500 PyASCIIObject *ascii;
501 unsigned int kind;
502
Victor Stinner68762572019-10-07 18:42:01 +0200503 assert(op != NULL);
504 CHECK(PyUnicode_Check(op));
Victor Stinner910337b2011-10-03 03:20:16 +0200505
506 ascii = (PyASCIIObject *)op;
507 kind = ascii->state.kind;
508
Victor Stinnera3b334d2011-10-03 13:53:37 +0200509 if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
Victor Stinner68762572019-10-07 18:42:01 +0200510 CHECK(kind == PyUnicode_1BYTE_KIND);
511 CHECK(ascii->state.ready == 1);
Victor Stinner910337b2011-10-03 03:20:16 +0200512 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200513 else {
Victor Stinner85041a52011-10-03 14:42:39 +0200514 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
Victor Stinner7f11ad42011-10-04 00:00:20 +0200515 void *data;
Victor Stinner910337b2011-10-03 03:20:16 +0200516
Victor Stinnera41463c2011-10-04 01:05:08 +0200517 if (ascii->state.compact == 1) {
518 data = compact + 1;
Victor Stinner68762572019-10-07 18:42:01 +0200519 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200520 || kind == PyUnicode_2BYTE_KIND
521 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200522 CHECK(ascii->state.ascii == 0);
523 CHECK(ascii->state.ready == 1);
524 CHECK(compact->utf8 != data);
Victor Stinnere30c0a12011-11-04 20:54:05 +0100525 }
526 else {
Victor Stinnera41463c2011-10-04 01:05:08 +0200527 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
528
529 data = unicode->data.any;
530 if (kind == PyUnicode_WCHAR_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200531 CHECK(ascii->length == 0);
532 CHECK(ascii->hash == -1);
533 CHECK(ascii->state.compact == 0);
534 CHECK(ascii->state.ascii == 0);
535 CHECK(ascii->state.ready == 0);
536 CHECK(ascii->state.interned == SSTATE_NOT_INTERNED);
537 CHECK(ascii->wstr != NULL);
538 CHECK(data == NULL);
539 CHECK(compact->utf8 == NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200540 }
541 else {
Victor Stinner68762572019-10-07 18:42:01 +0200542 CHECK(kind == PyUnicode_1BYTE_KIND
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200543 || kind == PyUnicode_2BYTE_KIND
544 || kind == PyUnicode_4BYTE_KIND);
Victor Stinner68762572019-10-07 18:42:01 +0200545 CHECK(ascii->state.compact == 0);
546 CHECK(ascii->state.ready == 1);
547 CHECK(data != NULL);
Victor Stinnera41463c2011-10-04 01:05:08 +0200548 if (ascii->state.ascii) {
Victor Stinner68762572019-10-07 18:42:01 +0200549 CHECK(compact->utf8 == data);
550 CHECK(compact->utf8_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200551 }
552 else
Victor Stinner68762572019-10-07 18:42:01 +0200553 CHECK(compact->utf8 != data);
Victor Stinnera41463c2011-10-04 01:05:08 +0200554 }
555 }
556 if (kind != PyUnicode_WCHAR_KIND) {
Victor Stinner7f11ad42011-10-04 00:00:20 +0200557 if (
558#if SIZEOF_WCHAR_T == 2
559 kind == PyUnicode_2BYTE_KIND
560#else
561 kind == PyUnicode_4BYTE_KIND
562#endif
563 )
Victor Stinnera41463c2011-10-04 01:05:08 +0200564 {
Victor Stinner68762572019-10-07 18:42:01 +0200565 CHECK(ascii->wstr == data);
566 CHECK(compact->wstr_length == ascii->length);
Victor Stinnera41463c2011-10-04 01:05:08 +0200567 } else
Victor Stinner68762572019-10-07 18:42:01 +0200568 CHECK(ascii->wstr != data);
Victor Stinner910337b2011-10-03 03:20:16 +0200569 }
Victor Stinnera41463c2011-10-04 01:05:08 +0200570
571 if (compact->utf8 == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200572 CHECK(compact->utf8_length == 0);
Victor Stinnera41463c2011-10-04 01:05:08 +0200573 if (ascii->wstr == NULL)
Victor Stinner68762572019-10-07 18:42:01 +0200574 CHECK(compact->wstr_length == 0);
Victor Stinner910337b2011-10-03 03:20:16 +0200575 }
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200576
577 /* check that the best kind is used: O(n) operation */
578 if (check_content && kind != PyUnicode_WCHAR_KIND) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200579 Py_ssize_t i;
580 Py_UCS4 maxchar = 0;
Victor Stinner718fbf02012-04-26 00:39:37 +0200581 void *data;
582 Py_UCS4 ch;
583
584 data = PyUnicode_DATA(ascii);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200585 for (i=0; i < ascii->length; i++)
586 {
Victor Stinner718fbf02012-04-26 00:39:37 +0200587 ch = PyUnicode_READ(kind, data, i);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200588 if (ch > maxchar)
589 maxchar = ch;
590 }
591 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinner77faf692011-11-20 18:56:05 +0100592 if (ascii->state.ascii == 0) {
Victor Stinner68762572019-10-07 18:42:01 +0200593 CHECK(maxchar >= 128);
594 CHECK(maxchar <= 255);
Victor Stinner77faf692011-11-20 18:56:05 +0100595 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200596 else
Victor Stinner68762572019-10-07 18:42:01 +0200597 CHECK(maxchar < 128);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200598 }
Victor Stinner77faf692011-11-20 18:56:05 +0100599 else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinner68762572019-10-07 18:42:01 +0200600 CHECK(maxchar >= 0x100);
601 CHECK(maxchar <= 0xFFFF);
Victor Stinner77faf692011-11-20 18:56:05 +0100602 }
603 else {
Victor Stinner68762572019-10-07 18:42:01 +0200604 CHECK(maxchar >= 0x10000);
605 CHECK(maxchar <= MAX_UNICODE);
Victor Stinner77faf692011-11-20 18:56:05 +0100606 }
Victor Stinner68762572019-10-07 18:42:01 +0200607 CHECK(PyUnicode_READ(kind, data, ascii->length) == 0);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +0200608 }
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400609 return 1;
Victor Stinner68762572019-10-07 18:42:01 +0200610
611#undef CHECK
Benjamin Petersonccc51c12011-10-03 19:34:12 -0400612}
Victor Stinner0fc91ee2019-04-12 21:51:34 +0200613
Victor Stinner910337b2011-10-03 03:20:16 +0200614
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100615static PyObject*
616unicode_result_wchar(PyObject *unicode)
617{
618#ifndef Py_DEBUG
619 Py_ssize_t len;
620
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100621 len = _PyUnicode_WSTR_LENGTH(unicode);
622 if (len == 0) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100623 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200624 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100625 }
626
627 if (len == 1) {
628 wchar_t ch = _PyUnicode_WSTR(unicode)[0];
Victor Stinnerd21b58c2013-02-26 00:15:54 +0100629 if ((Py_UCS4)ch < 256) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100630 PyObject *latin1_char = get_latin1_char((unsigned char)ch);
631 Py_DECREF(unicode);
632 return latin1_char;
633 }
634 }
635
636 if (_PyUnicode_Ready(unicode) < 0) {
Victor Stinneraa771272012-10-04 02:32:58 +0200637 Py_DECREF(unicode);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100638 return NULL;
639 }
640#else
Victor Stinneraa771272012-10-04 02:32:58 +0200641 assert(Py_REFCNT(unicode) == 1);
642
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100643 /* don't make the result ready in debug mode to ensure that the caller
644 makes the string ready before using it */
645 assert(_PyUnicode_CheckConsistency(unicode, 1));
646#endif
647 return unicode;
648}
649
650static PyObject*
651unicode_result_ready(PyObject *unicode)
652{
653 Py_ssize_t length;
654
655 length = PyUnicode_GET_LENGTH(unicode);
656 if (length == 0) {
657 if (unicode != unicode_empty) {
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100658 Py_DECREF(unicode);
Serhiy Storchaka678db842013-01-26 12:16:36 +0200659 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100660 }
661 return unicode_empty;
662 }
663
664 if (length == 1) {
Victor Stinner69ed0f42013-04-09 21:48:24 +0200665 void *data = PyUnicode_DATA(unicode);
666 int kind = PyUnicode_KIND(unicode);
667 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +0100668 if (ch < 256) {
669 PyObject *latin1_char = unicode_latin1[ch];
670 if (latin1_char != NULL) {
671 if (unicode != latin1_char) {
672 Py_INCREF(latin1_char);
673 Py_DECREF(unicode);
674 }
675 return latin1_char;
676 }
677 else {
678 assert(_PyUnicode_CheckConsistency(unicode, 1));
679 Py_INCREF(unicode);
680 unicode_latin1[ch] = unicode;
681 return unicode;
682 }
683 }
684 }
685
686 assert(_PyUnicode_CheckConsistency(unicode, 1));
687 return unicode;
688}
689
690static PyObject*
691unicode_result(PyObject *unicode)
692{
693 assert(_PyUnicode_CHECK(unicode));
694 if (PyUnicode_IS_READY(unicode))
695 return unicode_result_ready(unicode);
696 else
697 return unicode_result_wchar(unicode);
698}
699
Victor Stinnerc4b49542011-12-11 22:44:26 +0100700static PyObject*
701unicode_result_unchanged(PyObject *unicode)
702{
703 if (PyUnicode_CheckExact(unicode)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -0500704 if (PyUnicode_READY(unicode) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +0100705 return NULL;
706 Py_INCREF(unicode);
707 return unicode;
708 }
709 else
710 /* Subtype -- return genuine unicode string with the same value. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +0100711 return _PyUnicode_Copy(unicode);
Victor Stinnerc4b49542011-12-11 22:44:26 +0100712}
713
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200714/* Implementation of the "backslashreplace" error handler for 8-bit encodings:
715 ASCII, Latin1, UTF-8, etc. */
716static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200717backslashreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200718 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
719{
Victor Stinnerad771582015-10-09 12:38:53 +0200720 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200721 Py_UCS4 ch;
722 enum PyUnicode_Kind kind;
723 void *data;
724
725 assert(PyUnicode_IS_READY(unicode));
726 kind = PyUnicode_KIND(unicode);
727 data = PyUnicode_DATA(unicode);
728
729 size = 0;
730 /* determine replacement size */
731 for (i = collstart; i < collend; ++i) {
732 Py_ssize_t incr;
733
734 ch = PyUnicode_READ(kind, data, i);
735 if (ch < 0x100)
736 incr = 2+2;
737 else if (ch < 0x10000)
738 incr = 2+4;
739 else {
740 assert(ch <= MAX_UNICODE);
Victor Stinner3fa36ff2015-10-09 03:37:11 +0200741 incr = 2+8;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200742 }
743 if (size > PY_SSIZE_T_MAX - incr) {
744 PyErr_SetString(PyExc_OverflowError,
745 "encoded result is too long for a Python string");
746 return NULL;
747 }
748 size += incr;
749 }
750
Victor Stinnerad771582015-10-09 12:38:53 +0200751 str = _PyBytesWriter_Prepare(writer, str, size);
752 if (str == NULL)
753 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200754
755 /* generate replacement */
756 for (i = collstart; i < collend; ++i) {
757 ch = PyUnicode_READ(kind, data, i);
Victor Stinner797485e2015-10-09 03:17:30 +0200758 *str++ = '\\';
759 if (ch >= 0x00010000) {
760 *str++ = 'U';
761 *str++ = Py_hexdigits[(ch>>28)&0xf];
762 *str++ = Py_hexdigits[(ch>>24)&0xf];
763 *str++ = Py_hexdigits[(ch>>20)&0xf];
764 *str++ = Py_hexdigits[(ch>>16)&0xf];
765 *str++ = Py_hexdigits[(ch>>12)&0xf];
766 *str++ = Py_hexdigits[(ch>>8)&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200767 }
Victor Stinner797485e2015-10-09 03:17:30 +0200768 else if (ch >= 0x100) {
769 *str++ = 'u';
770 *str++ = Py_hexdigits[(ch>>12)&0xf];
771 *str++ = Py_hexdigits[(ch>>8)&0xf];
772 }
773 else
774 *str++ = 'x';
775 *str++ = Py_hexdigits[(ch>>4)&0xf];
776 *str++ = Py_hexdigits[ch&0xf];
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200777 }
778 return str;
779}
780
781/* Implementation of the "xmlcharrefreplace" error handler for 8-bit encodings:
782 ASCII, Latin1, UTF-8, etc. */
783static char*
Victor Stinnerad771582015-10-09 12:38:53 +0200784xmlcharrefreplace(_PyBytesWriter *writer, char *str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200785 PyObject *unicode, Py_ssize_t collstart, Py_ssize_t collend)
786{
Victor Stinnerad771582015-10-09 12:38:53 +0200787 Py_ssize_t size, i;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200788 Py_UCS4 ch;
789 enum PyUnicode_Kind kind;
790 void *data;
791
792 assert(PyUnicode_IS_READY(unicode));
793 kind = PyUnicode_KIND(unicode);
794 data = PyUnicode_DATA(unicode);
795
796 size = 0;
797 /* determine replacement size */
798 for (i = collstart; i < collend; ++i) {
799 Py_ssize_t incr;
800
801 ch = PyUnicode_READ(kind, data, i);
802 if (ch < 10)
803 incr = 2+1+1;
804 else if (ch < 100)
805 incr = 2+2+1;
806 else if (ch < 1000)
807 incr = 2+3+1;
808 else if (ch < 10000)
809 incr = 2+4+1;
810 else if (ch < 100000)
811 incr = 2+5+1;
812 else if (ch < 1000000)
813 incr = 2+6+1;
814 else {
815 assert(ch <= MAX_UNICODE);
816 incr = 2+7+1;
817 }
818 if (size > PY_SSIZE_T_MAX - incr) {
819 PyErr_SetString(PyExc_OverflowError,
820 "encoded result is too long for a Python string");
821 return NULL;
822 }
823 size += incr;
824 }
825
Victor Stinnerad771582015-10-09 12:38:53 +0200826 str = _PyBytesWriter_Prepare(writer, str, size);
827 if (str == NULL)
828 return NULL;
Victor Stinnere7bf86c2015-10-09 01:39:28 +0200829
830 /* generate replacement */
831 for (i = collstart; i < collend; ++i) {
832 str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
833 }
834 return str;
835}
836
Thomas Wouters477c8d52006-05-27 19:21:47 +0000837/* --- Bloom Filters ----------------------------------------------------- */
838
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
842
843/* the linebreak mask is set up by Unicode_Init below */
844
Antoine Pitrouf068f942010-01-13 14:19:12 +0000845#if LONG_BIT >= 128
846#define BLOOM_WIDTH 128
847#elif LONG_BIT >= 64
848#define BLOOM_WIDTH 64
849#elif LONG_BIT >= 32
850#define BLOOM_WIDTH 32
851#else
852#error "LONG_BIT is smaller than 32"
853#endif
854
Thomas Wouters477c8d52006-05-27 19:21:47 +0000855#define BLOOM_MASK unsigned long
856
Serhiy Storchaka05997252013-01-26 12:14:02 +0200857static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000858
Antoine Pitrouf068f942010-01-13 14:19:12 +0000859#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000860
Benjamin Peterson29060642009-01-31 22:14:21 +0000861#define BLOOM_LINEBREAK(ch) \
862 ((ch) < 128U ? ascii_linebreak[(ch)] : \
863 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
Thomas Wouters477c8d52006-05-27 19:21:47 +0000864
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700865static inline BLOOM_MASK
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200866make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
Thomas Wouters477c8d52006-05-27 19:21:47 +0000867{
Victor Stinnera85af502013-04-09 21:53:54 +0200868#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN) \
869 do { \
870 TYPE *data = (TYPE *)PTR; \
871 TYPE *end = data + LEN; \
872 Py_UCS4 ch; \
873 for (; data != end; data++) { \
874 ch = *data; \
875 MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
876 } \
877 break; \
878 } while (0)
879
Thomas Wouters477c8d52006-05-27 19:21:47 +0000880 /* calculate simple bloom-style bitmask for a given unicode string */
881
Antoine Pitrouf068f942010-01-13 14:19:12 +0000882 BLOOM_MASK mask;
Thomas Wouters477c8d52006-05-27 19:21:47 +0000883
884 mask = 0;
Victor Stinnera85af502013-04-09 21:53:54 +0200885 switch (kind) {
886 case PyUnicode_1BYTE_KIND:
887 BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
888 break;
889 case PyUnicode_2BYTE_KIND:
890 BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
891 break;
892 case PyUnicode_4BYTE_KIND:
893 BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
894 break;
895 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700896 Py_UNREACHABLE();
Victor Stinnera85af502013-04-09 21:53:54 +0200897 }
Thomas Wouters477c8d52006-05-27 19:21:47 +0000898 return mask;
Victor Stinnera85af502013-04-09 21:53:54 +0200899
900#undef BLOOM_UPDATE
Thomas Wouters477c8d52006-05-27 19:21:47 +0000901}
902
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300903static int
904ensure_unicode(PyObject *obj)
905{
906 if (!PyUnicode_Check(obj)) {
907 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +0200908 "must be str, not %.100s",
909 Py_TYPE(obj)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +0300910 return -1;
911 }
912 return PyUnicode_READY(obj);
913}
914
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200915/* Compilation of templated routines */
916
917#include "stringlib/asciilib.h"
918#include "stringlib/fastsearch.h"
919#include "stringlib/partition.h"
920#include "stringlib/split.h"
921#include "stringlib/count.h"
922#include "stringlib/find.h"
923#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200924#include "stringlib/undef.h"
925
926#include "stringlib/ucs1lib.h"
927#include "stringlib/fastsearch.h"
928#include "stringlib/partition.h"
929#include "stringlib/split.h"
930#include "stringlib/count.h"
931#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300932#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200933#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200934#include "stringlib/undef.h"
935
936#include "stringlib/ucs2lib.h"
937#include "stringlib/fastsearch.h"
938#include "stringlib/partition.h"
939#include "stringlib/split.h"
940#include "stringlib/count.h"
941#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300942#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200943#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200944#include "stringlib/undef.h"
945
946#include "stringlib/ucs4lib.h"
947#include "stringlib/fastsearch.h"
948#include "stringlib/partition.h"
949#include "stringlib/split.h"
950#include "stringlib/count.h"
951#include "stringlib/find.h"
Serhiy Storchakae2cef882013-04-13 22:45:04 +0300952#include "stringlib/replace.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200953#include "stringlib/find_max_char.h"
Antoine Pitroudd4e2f02011-10-13 00:02:27 +0200954#include "stringlib/undef.h"
955
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200956#include "stringlib/unicodedefs.h"
957#include "stringlib/fastsearch.h"
958#include "stringlib/count.h"
959#include "stringlib/find.h"
Antoine Pitrou0a3229d2011-11-21 20:39:13 +0100960#include "stringlib/undef.h"
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200961
Guido van Rossumd57fd912000-03-10 22:53:23 +0000962/* --- Unicode Object ----------------------------------------------------- */
963
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -0700964static inline Py_ssize_t
965findchar(const void *s, int kind,
966 Py_ssize_t size, Py_UCS4 ch,
967 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200968{
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200969 switch (kind) {
970 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200971 if ((Py_UCS1) ch != ch)
972 return -1;
973 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600974 return ucs1lib_find_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200975 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600976 return ucs1lib_rfind_char((const Py_UCS1 *) s, size, (Py_UCS1) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200977 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200978 if ((Py_UCS2) ch != ch)
979 return -1;
980 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600981 return ucs2lib_find_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200982 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600983 return ucs2lib_rfind_char((const Py_UCS2 *) s, size, (Py_UCS2) ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200984 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200985 if (direction > 0)
Andy Lestere6be9b52020-02-11 20:28:35 -0600986 return ucs4lib_find_char((const Py_UCS4 *) s, size, ch);
Serhiy Storchaka413fdce2015-11-14 15:42:17 +0200987 else
Andy Lestere6be9b52020-02-11 20:28:35 -0600988 return ucs4lib_rfind_char((const Py_UCS4 *) s, size, ch);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +0200989 default:
Barry Warsawb2e57942017-09-14 18:13:16 -0700990 Py_UNREACHABLE();
Victor Stinner9e7a1bc2011-10-13 00:18:12 +0200991 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +0200992}
993
Victor Stinnerafffce42012-10-03 23:03:17 +0200994#ifdef Py_DEBUG
Martin Panter6245cb32016-04-15 02:14:19 +0000995/* Fill the data of a Unicode string with invalid characters to detect bugs
Victor Stinnerafffce42012-10-03 23:03:17 +0200996 earlier.
997
998 _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
999 ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
1000 invalid character in Unicode 6.0. */
1001static void
1002unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
1003{
1004 int kind = PyUnicode_KIND(unicode);
1005 Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
1006 Py_ssize_t length = _PyUnicode_LENGTH(unicode);
1007 if (length <= old_length)
1008 return;
1009 memset(data + old_length * kind, 0xff, (length - old_length) * kind);
1010}
1011#endif
1012
Victor Stinnerfe226c02011-10-03 03:52:20 +02001013static PyObject*
1014resize_compact(PyObject *unicode, Py_ssize_t length)
1015{
1016 Py_ssize_t char_size;
1017 Py_ssize_t struct_size;
1018 Py_ssize_t new_size;
1019 int share_wstr;
Victor Stinner84def372011-12-11 20:04:56 +01001020 PyObject *new_unicode;
Victor Stinnerafffce42012-10-03 23:03:17 +02001021#ifdef Py_DEBUG
1022 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1023#endif
1024
Victor Stinner79891572012-05-03 13:43:07 +02001025 assert(unicode_modifiable(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001026 assert(PyUnicode_IS_READY(unicode));
Victor Stinner488fa492011-12-12 00:01:39 +01001027 assert(PyUnicode_IS_COMPACT(unicode));
1028
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001029 char_size = PyUnicode_KIND(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001030 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerfe226c02011-10-03 03:52:20 +02001031 struct_size = sizeof(PyASCIIObject);
1032 else
1033 struct_size = sizeof(PyCompactUnicodeObject);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001034 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001035
Victor Stinnerfe226c02011-10-03 03:52:20 +02001036 if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
1037 PyErr_NoMemory();
1038 return NULL;
1039 }
1040 new_size = (struct_size + (length + 1) * char_size);
1041
Serhiy Storchaka7aa69082015-12-03 01:02:03 +02001042 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
1043 PyObject_DEL(_PyUnicode_UTF8(unicode));
1044 _PyUnicode_UTF8(unicode) = NULL;
1045 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1046 }
Victor Stinner49932fe2020-02-03 17:55:05 +01001047#ifdef Py_REF_DEBUG
1048 _Py_RefTotal--;
1049#endif
1050#ifdef Py_TRACE_REFS
Victor Stinner84def372011-12-11 20:04:56 +01001051 _Py_ForgetReference(unicode);
Victor Stinner49932fe2020-02-03 17:55:05 +01001052#endif
Victor Stinner84def372011-12-11 20:04:56 +01001053
Serhiy Storchaka20b39b22014-09-28 11:27:24 +03001054 new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
Victor Stinner84def372011-12-11 20:04:56 +01001055 if (new_unicode == NULL) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001056 _Py_NewReference(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001057 PyErr_NoMemory();
1058 return NULL;
1059 }
Victor Stinner84def372011-12-11 20:04:56 +01001060 unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001061 _Py_NewReference(unicode);
Victor Stinner84def372011-12-11 20:04:56 +01001062
Victor Stinnerfe226c02011-10-03 03:52:20 +02001063 _PyUnicode_LENGTH(unicode) = length;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001064 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001065 _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
Victor Stinner488fa492011-12-12 00:01:39 +01001066 if (!PyUnicode_IS_ASCII(unicode))
Victor Stinnerc379ead2011-10-03 12:52:27 +02001067 _PyUnicode_WSTR_LENGTH(unicode) = length;
1068 }
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001069 else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
1070 PyObject_DEL(_PyUnicode_WSTR(unicode));
1071 _PyUnicode_WSTR(unicode) = NULL;
Victor Stinner5bc03a62016-01-27 16:56:53 +01001072 if (!PyUnicode_IS_ASCII(unicode))
1073 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Victor Stinnerbbbac2e2013-02-07 23:12:46 +01001074 }
Victor Stinnerafffce42012-10-03 23:03:17 +02001075#ifdef Py_DEBUG
1076 unicode_fill_invalid(unicode, old_length);
1077#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001078 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
1079 length, 0);
Victor Stinner79891572012-05-03 13:43:07 +02001080 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001081 return unicode;
1082}
1083
Alexander Belopolsky40018472011-02-26 01:02:56 +00001084static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001085resize_inplace(PyObject *unicode, Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001086{
Victor Stinner95663112011-10-04 01:03:50 +02001087 wchar_t *wstr;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001088 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001089 assert(!PyUnicode_IS_COMPACT(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001090 assert(Py_REFCNT(unicode) == 1);
Tim Petersced69f82003-09-16 20:30:58 +00001091
Victor Stinnerfe226c02011-10-03 03:52:20 +02001092 if (PyUnicode_IS_READY(unicode)) {
1093 Py_ssize_t char_size;
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001094 int share_wstr, share_utf8;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001095 void *data;
Victor Stinnerafffce42012-10-03 23:03:17 +02001096#ifdef Py_DEBUG
1097 Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
1098#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001099
1100 data = _PyUnicode_DATA_ANY(unicode);
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001101 char_size = PyUnicode_KIND(unicode);
Victor Stinnerc379ead2011-10-03 12:52:27 +02001102 share_wstr = _PyUnicode_SHARE_WSTR(unicode);
1103 share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001104
1105 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
1106 PyErr_NoMemory();
1107 return -1;
1108 }
1109 new_size = (length + 1) * char_size;
1110
Victor Stinner7a9105a2011-12-12 00:13:42 +01001111 if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
1112 {
1113 PyObject_DEL(_PyUnicode_UTF8(unicode));
1114 _PyUnicode_UTF8(unicode) = NULL;
1115 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1116 }
1117
Victor Stinnerfe226c02011-10-03 03:52:20 +02001118 data = (PyObject *)PyObject_REALLOC(data, new_size);
1119 if (data == NULL) {
1120 PyErr_NoMemory();
1121 return -1;
1122 }
1123 _PyUnicode_DATA_ANY(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001124 if (share_wstr) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001125 _PyUnicode_WSTR(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001126 _PyUnicode_WSTR_LENGTH(unicode) = length;
1127 }
1128 if (share_utf8) {
Victor Stinner1c8d0c72011-10-03 12:11:00 +02001129 _PyUnicode_UTF8(unicode) = data;
Victor Stinnerc379ead2011-10-03 12:52:27 +02001130 _PyUnicode_UTF8_LENGTH(unicode) = length;
1131 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001132 _PyUnicode_LENGTH(unicode) = length;
1133 PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
Victor Stinnerafffce42012-10-03 23:03:17 +02001134#ifdef Py_DEBUG
1135 unicode_fill_invalid(unicode, old_length);
1136#endif
Victor Stinner95663112011-10-04 01:03:50 +02001137 if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001138 assert(_PyUnicode_CheckConsistency(unicode, 0));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001139 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001140 }
Victor Stinnerfe226c02011-10-03 03:52:20 +02001141 }
Victor Stinner95663112011-10-04 01:03:50 +02001142 assert(_PyUnicode_WSTR(unicode) != NULL);
1143
1144 /* check for integer overflow */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001145 if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
Victor Stinner95663112011-10-04 01:03:50 +02001146 PyErr_NoMemory();
1147 return -1;
1148 }
Victor Stinner7a9105a2011-12-12 00:13:42 +01001149 new_size = sizeof(wchar_t) * (length + 1);
Victor Stinner95663112011-10-04 01:03:50 +02001150 wstr = _PyUnicode_WSTR(unicode);
Victor Stinner7a9105a2011-12-12 00:13:42 +01001151 wstr = PyObject_REALLOC(wstr, new_size);
Victor Stinner95663112011-10-04 01:03:50 +02001152 if (!wstr) {
1153 PyErr_NoMemory();
1154 return -1;
1155 }
1156 _PyUnicode_WSTR(unicode) = wstr;
1157 _PyUnicode_WSTR(unicode)[length] = 0;
1158 _PyUnicode_WSTR_LENGTH(unicode) = length;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001159 assert(_PyUnicode_CheckConsistency(unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001160 return 0;
1161}
1162
Victor Stinnerfe226c02011-10-03 03:52:20 +02001163static PyObject*
1164resize_copy(PyObject *unicode, Py_ssize_t length)
1165{
1166 Py_ssize_t copy_length;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001167 if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02001168 PyObject *copy;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001169
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03001170 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001171
1172 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
1173 if (copy == NULL)
1174 return NULL;
1175
1176 copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
Victor Stinnerd3f08822012-05-29 12:57:52 +02001177 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001178 return copy;
Victor Stinner8cfcbed2011-10-03 23:19:21 +02001179 }
1180 else {
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001181 PyObject *w;
Victor Stinner7a9105a2011-12-12 00:13:42 +01001182
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001183 w = (PyObject*)_PyUnicode_New(length);
Victor Stinnerfe226c02011-10-03 03:52:20 +02001184 if (w == NULL)
1185 return NULL;
1186 copy_length = _PyUnicode_WSTR_LENGTH(unicode);
1187 copy_length = Py_MIN(copy_length, length);
Christian Heimesf051e432016-09-13 20:22:02 +02001188 memcpy(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
Victor Stinnerc6cf1ba2012-10-23 02:54:47 +02001189 copy_length * sizeof(wchar_t));
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001190 return w;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001191 }
1192}
1193
Guido van Rossumd57fd912000-03-10 22:53:23 +00001194/* We allocate one more byte to make sure the string is
Martin v. Löwis47383402007-08-15 07:32:56 +00001195 Ux0000 terminated; some code (e.g. new_identifier)
1196 relies on that.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001197
1198 XXX This allocator could further be enhanced by assuring that the
Benjamin Peterson29060642009-01-31 22:14:21 +00001199 free list never reduces its size below 1.
Guido van Rossumd57fd912000-03-10 22:53:23 +00001200
1201*/
1202
Alexander Belopolsky40018472011-02-26 01:02:56 +00001203static PyUnicodeObject *
1204_PyUnicode_New(Py_ssize_t length)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001205{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001206 PyUnicodeObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001207 size_t new_size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001208
Thomas Wouters477c8d52006-05-27 19:21:47 +00001209 /* Optimization for empty strings */
Guido van Rossumd57fd912000-03-10 22:53:23 +00001210 if (length == 0 && unicode_empty != NULL) {
1211 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001212 return (PyUnicodeObject*)unicode_empty;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001213 }
1214
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001215 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -07001216 if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +00001217 return (PyUnicodeObject *)PyErr_NoMemory();
1218 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001219 if (length < 0) {
1220 PyErr_SetString(PyExc_SystemError,
1221 "Negative size passed to _PyUnicode_New");
1222 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00001223 }
1224
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001225 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
1226 if (unicode == NULL)
1227 return NULL;
1228 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
Victor Stinner68b674c2013-10-29 19:31:43 +01001229
1230 _PyUnicode_WSTR_LENGTH(unicode) = length;
1231 _PyUnicode_HASH(unicode) = -1;
1232 _PyUnicode_STATE(unicode).interned = 0;
1233 _PyUnicode_STATE(unicode).kind = 0;
1234 _PyUnicode_STATE(unicode).compact = 0;
1235 _PyUnicode_STATE(unicode).ready = 0;
1236 _PyUnicode_STATE(unicode).ascii = 0;
1237 _PyUnicode_DATA_ANY(unicode) = NULL;
1238 _PyUnicode_LENGTH(unicode) = 0;
1239 _PyUnicode_UTF8(unicode) = NULL;
1240 _PyUnicode_UTF8_LENGTH(unicode) = 0;
1241
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001242 _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
1243 if (!_PyUnicode_WSTR(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001244 Py_DECREF(unicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00001245 PyErr_NoMemory();
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001246 return NULL;
Guido van Rossum3c1bb802000-04-27 20:13:50 +00001247 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001248
Jeremy Hyltond8082792003-09-16 19:41:39 +00001249 /* Initialize the first element to guard against cases where
Tim Petersced69f82003-09-16 20:30:58 +00001250 * the caller fails before initializing str -- unicode_resize()
1251 * reads str[0], and the Keep-Alive optimization can keep memory
1252 * allocated for str alive across a call to unicode_dealloc(unicode).
1253 * We don't want unicode_resize to read uninitialized memory in
1254 * that case.
1255 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001256 _PyUnicode_WSTR(unicode)[0] = 0;
1257 _PyUnicode_WSTR(unicode)[length] = 0;
Victor Stinner68b674c2013-10-29 19:31:43 +01001258
Victor Stinner7931d9a2011-11-04 00:22:48 +01001259 assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
Guido van Rossumd57fd912000-03-10 22:53:23 +00001260 return unicode;
1261}
1262
Victor Stinnerf42dc442011-10-02 23:33:16 +02001263static const char*
1264unicode_kind_name(PyObject *unicode)
1265{
Victor Stinner42dfd712011-10-03 14:41:45 +02001266 /* don't check consistency: unicode_kind_name() is called from
1267 _PyUnicode_Dump() */
Victor Stinnerf42dc442011-10-02 23:33:16 +02001268 if (!PyUnicode_IS_COMPACT(unicode))
1269 {
1270 if (!PyUnicode_IS_READY(unicode))
1271 return "wstr";
Benjamin Petersonead6b532011-12-20 17:23:42 -06001272 switch (PyUnicode_KIND(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001273 {
1274 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001275 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001276 return "legacy ascii";
1277 else
1278 return "legacy latin1";
1279 case PyUnicode_2BYTE_KIND:
1280 return "legacy UCS2";
1281 case PyUnicode_4BYTE_KIND:
1282 return "legacy UCS4";
1283 default:
1284 return "<legacy invalid kind>";
1285 }
1286 }
1287 assert(PyUnicode_IS_READY(unicode));
Benjamin Petersonead6b532011-12-20 17:23:42 -06001288 switch (PyUnicode_KIND(unicode)) {
Victor Stinnerf42dc442011-10-02 23:33:16 +02001289 case PyUnicode_1BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001290 if (PyUnicode_IS_ASCII(unicode))
Victor Stinnerf42dc442011-10-02 23:33:16 +02001291 return "ascii";
1292 else
Victor Stinnera3b334d2011-10-03 13:53:37 +02001293 return "latin1";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001294 case PyUnicode_2BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001295 return "UCS2";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001296 case PyUnicode_4BYTE_KIND:
Victor Stinnera3b334d2011-10-03 13:53:37 +02001297 return "UCS4";
Victor Stinnerf42dc442011-10-02 23:33:16 +02001298 default:
1299 return "<invalid compact kind>";
1300 }
1301}
1302
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001303#ifdef Py_DEBUG
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001304/* Functions wrapping macros for use in debugger */
Victor Stinnera42de742018-11-22 10:25:22 +01001305char *_PyUnicode_utf8(void *unicode_raw){
1306 PyObject *unicode = _PyObject_CAST(unicode_raw);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001307 return PyUnicode_UTF8(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001308}
1309
Victor Stinnera42de742018-11-22 10:25:22 +01001310void *_PyUnicode_compact_data(void *unicode_raw) {
1311 PyObject *unicode = _PyObject_CAST(unicode_raw);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001312 return _PyUnicode_COMPACT_DATA(unicode);
1313}
Victor Stinnera42de742018-11-22 10:25:22 +01001314void *_PyUnicode_data(void *unicode_raw) {
1315 PyObject *unicode = _PyObject_CAST(unicode_raw);
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001316 printf("obj %p\n", (void*)unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001317 printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1318 printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1319 printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1320 printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1321 printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1322 return PyUnicode_DATA(unicode);
1323}
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001324
1325void
1326_PyUnicode_Dump(PyObject *op)
1327{
1328 PyASCIIObject *ascii = (PyASCIIObject *)op;
Victor Stinnera849a4b2011-10-03 12:12:11 +02001329 PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1330 PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1331 void *data;
Victor Stinner0d60e872011-10-23 19:47:19 +02001332
Victor Stinnera849a4b2011-10-03 12:12:11 +02001333 if (ascii->state.compact)
Victor Stinner0d60e872011-10-23 19:47:19 +02001334 {
1335 if (ascii->state.ascii)
1336 data = (ascii + 1);
1337 else
1338 data = (compact + 1);
1339 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001340 else
1341 data = unicode->data.any;
Victor Stinner293f3f52014-07-01 08:57:10 +02001342 printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1343 unicode_kind_name(op), ascii->length);
Victor Stinner0d60e872011-10-23 19:47:19 +02001344
Victor Stinnera849a4b2011-10-03 12:12:11 +02001345 if (ascii->wstr == data)
1346 printf("shared ");
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001347 printf("wstr=%p", (void *)ascii->wstr);
Victor Stinner0d60e872011-10-23 19:47:19 +02001348
Victor Stinnera3b334d2011-10-03 13:53:37 +02001349 if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
Victor Stinner293f3f52014-07-01 08:57:10 +02001350 printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
Victor Stinnera849a4b2011-10-03 12:12:11 +02001351 if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1352 printf("shared ");
Victor Stinner293f3f52014-07-01 08:57:10 +02001353 printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
Zackery Spytz1a2252e2019-05-06 10:56:51 -06001354 (void *)compact->utf8, compact->utf8_length);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001355 }
Victor Stinnera849a4b2011-10-03 12:12:11 +02001356 printf(", data=%p\n", data);
Victor Stinnerfe0c1552011-10-03 02:59:31 +02001357}
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001358#endif
1359
1360PyObject *
1361PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1362{
1363 PyObject *obj;
1364 PyCompactUnicodeObject *unicode;
1365 void *data;
Victor Stinner8f825062012-04-27 13:55:39 +02001366 enum PyUnicode_Kind kind;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001367 int is_sharing, is_ascii;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001368 Py_ssize_t char_size;
1369 Py_ssize_t struct_size;
1370
1371 /* Optimization for empty strings */
1372 if (size == 0 && unicode_empty != NULL) {
1373 Py_INCREF(unicode_empty);
Victor Stinnera464fc12011-10-02 20:39:30 +02001374 return unicode_empty;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001375 }
1376
Victor Stinner9e9d6892011-10-04 01:02:02 +02001377 is_ascii = 0;
1378 is_sharing = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001379 struct_size = sizeof(PyCompactUnicodeObject);
1380 if (maxchar < 128) {
Victor Stinner8f825062012-04-27 13:55:39 +02001381 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001382 char_size = 1;
1383 is_ascii = 1;
1384 struct_size = sizeof(PyASCIIObject);
1385 }
1386 else if (maxchar < 256) {
Victor Stinner8f825062012-04-27 13:55:39 +02001387 kind = PyUnicode_1BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001388 char_size = 1;
1389 }
1390 else if (maxchar < 65536) {
Victor Stinner8f825062012-04-27 13:55:39 +02001391 kind = PyUnicode_2BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001392 char_size = 2;
1393 if (sizeof(wchar_t) == 2)
1394 is_sharing = 1;
1395 }
1396 else {
Victor Stinnerc9590ad2012-03-04 01:34:37 +01001397 if (maxchar > MAX_UNICODE) {
1398 PyErr_SetString(PyExc_SystemError,
1399 "invalid maximum character passed to PyUnicode_New");
1400 return NULL;
1401 }
Victor Stinner8f825062012-04-27 13:55:39 +02001402 kind = PyUnicode_4BYTE_KIND;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001403 char_size = 4;
1404 if (sizeof(wchar_t) == 4)
1405 is_sharing = 1;
1406 }
1407
1408 /* Ensure we won't overflow the size. */
1409 if (size < 0) {
1410 PyErr_SetString(PyExc_SystemError,
1411 "Negative size passed to PyUnicode_New");
1412 return NULL;
1413 }
1414 if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1415 return PyErr_NoMemory();
1416
1417 /* Duplicated allocation code from _PyObject_New() instead of a call to
1418 * PyObject_New() so we are able to allocate space for the object and
1419 * it's data buffer.
1420 */
1421 obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1422 if (obj == NULL)
1423 return PyErr_NoMemory();
1424 obj = PyObject_INIT(obj, &PyUnicode_Type);
1425 if (obj == NULL)
1426 return NULL;
1427
1428 unicode = (PyCompactUnicodeObject *)obj;
1429 if (is_ascii)
1430 data = ((PyASCIIObject*)obj) + 1;
1431 else
1432 data = unicode + 1;
1433 _PyUnicode_LENGTH(unicode) = size;
1434 _PyUnicode_HASH(unicode) = -1;
1435 _PyUnicode_STATE(unicode).interned = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001436 _PyUnicode_STATE(unicode).kind = kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001437 _PyUnicode_STATE(unicode).compact = 1;
1438 _PyUnicode_STATE(unicode).ready = 1;
1439 _PyUnicode_STATE(unicode).ascii = is_ascii;
1440 if (is_ascii) {
1441 ((char*)data)[size] = 0;
1442 _PyUnicode_WSTR(unicode) = NULL;
1443 }
Victor Stinner8f825062012-04-27 13:55:39 +02001444 else if (kind == PyUnicode_1BYTE_KIND) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001445 ((char*)data)[size] = 0;
1446 _PyUnicode_WSTR(unicode) = NULL;
1447 _PyUnicode_WSTR_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001448 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001449 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001450 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001451 else {
1452 unicode->utf8 = NULL;
Victor Stinner9e9d6892011-10-04 01:02:02 +02001453 unicode->utf8_length = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001454 if (kind == PyUnicode_2BYTE_KIND)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001455 ((Py_UCS2*)data)[size] = 0;
Victor Stinner8f825062012-04-27 13:55:39 +02001456 else /* kind == PyUnicode_4BYTE_KIND */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001457 ((Py_UCS4*)data)[size] = 0;
1458 if (is_sharing) {
1459 _PyUnicode_WSTR_LENGTH(unicode) = size;
1460 _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1461 }
1462 else {
1463 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1464 _PyUnicode_WSTR(unicode) = NULL;
1465 }
1466 }
Victor Stinner8f825062012-04-27 13:55:39 +02001467#ifdef Py_DEBUG
Victor Stinnerafffce42012-10-03 23:03:17 +02001468 unicode_fill_invalid((PyObject*)unicode, 0);
Victor Stinner8f825062012-04-27 13:55:39 +02001469#endif
Victor Stinner7931d9a2011-11-04 00:22:48 +01001470 assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001471 return obj;
1472}
1473
1474#if SIZEOF_WCHAR_T == 2
1475/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1476 will decode surrogate pairs, the other conversions are implemented as macros
Georg Brandl7597add2011-10-05 16:36:47 +02001477 for efficiency.
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001478
1479 This function assumes that unicode can hold one more code point than wstr
1480 characters for a terminating null character. */
Victor Stinnerc53be962011-10-02 21:33:54 +02001481static void
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001482unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
Victor Stinner9db1a8b2011-10-23 20:04:37 +02001483 PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001484{
1485 const wchar_t *iter;
1486 Py_UCS4 *ucs4_out;
1487
Victor Stinner910337b2011-10-03 03:20:16 +02001488 assert(unicode != NULL);
1489 assert(_PyUnicode_CHECK(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001490 assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1491 ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1492
1493 for (iter = begin; iter < end; ) {
1494 assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1495 _PyUnicode_GET_LENGTH(unicode)));
Victor Stinner551ac952011-11-29 22:58:13 +01001496 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1497 && (iter+1) < end
1498 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001499 {
Victor Stinner551ac952011-11-29 22:58:13 +01001500 *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001501 iter += 2;
1502 }
1503 else {
1504 *ucs4_out++ = *iter;
1505 iter++;
1506 }
1507 }
1508 assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1509 _PyUnicode_GET_LENGTH(unicode)));
1510
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001511}
1512#endif
1513
Victor Stinnercd9950f2011-10-02 00:34:53 +02001514static int
Victor Stinner488fa492011-12-12 00:01:39 +01001515unicode_check_modifiable(PyObject *unicode)
Victor Stinnercd9950f2011-10-02 00:34:53 +02001516{
Victor Stinner488fa492011-12-12 00:01:39 +01001517 if (!unicode_modifiable(unicode)) {
Victor Stinner01698042011-10-04 00:04:26 +02001518 PyErr_SetString(PyExc_SystemError,
Victor Stinner488fa492011-12-12 00:01:39 +01001519 "Cannot modify a string currently used");
Victor Stinnercd9950f2011-10-02 00:34:53 +02001520 return -1;
1521 }
Victor Stinnercd9950f2011-10-02 00:34:53 +02001522 return 0;
1523}
1524
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001525static int
1526_copy_characters(PyObject *to, Py_ssize_t to_start,
1527 PyObject *from, Py_ssize_t from_start,
1528 Py_ssize_t how_many, int check_maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001529{
Victor Stinnera0702ab2011-09-29 14:14:38 +02001530 unsigned int from_kind, to_kind;
1531 void *from_data, *to_data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001532
Victor Stinneree4544c2012-05-09 22:24:08 +02001533 assert(0 <= how_many);
1534 assert(0 <= from_start);
1535 assert(0 <= to_start);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001536 assert(PyUnicode_Check(from));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001537 assert(PyUnicode_IS_READY(from));
Victor Stinneree4544c2012-05-09 22:24:08 +02001538 assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001539
Victor Stinnerd3f08822012-05-29 12:57:52 +02001540 assert(PyUnicode_Check(to));
1541 assert(PyUnicode_IS_READY(to));
1542 assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1543
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001544 if (how_many == 0)
1545 return 0;
1546
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001547 from_kind = PyUnicode_KIND(from);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001548 from_data = PyUnicode_DATA(from);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001549 to_kind = PyUnicode_KIND(to);
Victor Stinnera0702ab2011-09-29 14:14:38 +02001550 to_data = PyUnicode_DATA(to);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001551
Victor Stinnerf1852262012-06-16 16:38:26 +02001552#ifdef Py_DEBUG
1553 if (!check_maxchar
1554 && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1555 {
1556 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1557 Py_UCS4 ch;
1558 Py_ssize_t i;
1559 for (i=0; i < how_many; i++) {
1560 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1561 assert(ch <= to_maxchar);
1562 }
1563 }
1564#endif
1565
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001566 if (from_kind == to_kind) {
Victor Stinnerf1852262012-06-16 16:38:26 +02001567 if (check_maxchar
1568 && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1569 {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001570 /* Writing Latin-1 characters into an ASCII string requires to
1571 check that all written characters are pure ASCII */
Victor Stinnerf1852262012-06-16 16:38:26 +02001572 Py_UCS4 max_char;
1573 max_char = ucs1lib_find_max_char(from_data,
1574 (Py_UCS1*)from_data + how_many);
1575 if (max_char >= 128)
1576 return -1;
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001577 }
Christian Heimesf051e432016-09-13 20:22:02 +02001578 memcpy((char*)to_data + to_kind * to_start,
Martin v. Löwisc47adb02011-10-07 20:55:35 +02001579 (char*)from_data + from_kind * from_start,
1580 to_kind * how_many);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001581 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001582 else if (from_kind == PyUnicode_1BYTE_KIND
1583 && to_kind == PyUnicode_2BYTE_KIND)
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001584 {
1585 _PyUnicode_CONVERT_BYTES(
1586 Py_UCS1, Py_UCS2,
1587 PyUnicode_1BYTE_DATA(from) + from_start,
1588 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1589 PyUnicode_2BYTE_DATA(to) + to_start
1590 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001591 }
Victor Stinner157f83f2011-09-28 21:41:31 +02001592 else if (from_kind == PyUnicode_1BYTE_KIND
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001593 && to_kind == PyUnicode_4BYTE_KIND)
1594 {
1595 _PyUnicode_CONVERT_BYTES(
1596 Py_UCS1, Py_UCS4,
1597 PyUnicode_1BYTE_DATA(from) + from_start,
1598 PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1599 PyUnicode_4BYTE_DATA(to) + to_start
1600 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001601 }
1602 else if (from_kind == PyUnicode_2BYTE_KIND
1603 && to_kind == PyUnicode_4BYTE_KIND)
1604 {
1605 _PyUnicode_CONVERT_BYTES(
1606 Py_UCS2, Py_UCS4,
1607 PyUnicode_2BYTE_DATA(from) + from_start,
1608 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1609 PyUnicode_4BYTE_DATA(to) + to_start
1610 );
Victor Stinnerbe78eaf2011-09-28 21:37:03 +02001611 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001612 else {
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001613 assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1614
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001615 if (!check_maxchar) {
1616 if (from_kind == PyUnicode_2BYTE_KIND
1617 && to_kind == PyUnicode_1BYTE_KIND)
1618 {
1619 _PyUnicode_CONVERT_BYTES(
1620 Py_UCS2, Py_UCS1,
1621 PyUnicode_2BYTE_DATA(from) + from_start,
1622 PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1623 PyUnicode_1BYTE_DATA(to) + to_start
1624 );
1625 }
1626 else if (from_kind == PyUnicode_4BYTE_KIND
1627 && to_kind == PyUnicode_1BYTE_KIND)
1628 {
1629 _PyUnicode_CONVERT_BYTES(
1630 Py_UCS4, Py_UCS1,
1631 PyUnicode_4BYTE_DATA(from) + from_start,
1632 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1633 PyUnicode_1BYTE_DATA(to) + to_start
1634 );
1635 }
1636 else if (from_kind == PyUnicode_4BYTE_KIND
1637 && to_kind == PyUnicode_2BYTE_KIND)
1638 {
1639 _PyUnicode_CONVERT_BYTES(
1640 Py_UCS4, Py_UCS2,
1641 PyUnicode_4BYTE_DATA(from) + from_start,
1642 PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1643 PyUnicode_2BYTE_DATA(to) + to_start
1644 );
1645 }
1646 else {
Barry Warsawb2e57942017-09-14 18:13:16 -07001647 Py_UNREACHABLE();
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001648 }
1649 }
Victor Stinnerf1852262012-06-16 16:38:26 +02001650 else {
Victor Stinnera0702ab2011-09-29 14:14:38 +02001651 const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001652 Py_UCS4 ch;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001653 Py_ssize_t i;
1654
Victor Stinnera0702ab2011-09-29 14:14:38 +02001655 for (i=0; i < how_many; i++) {
1656 ch = PyUnicode_READ(from_kind, from_data, from_start + i);
Victor Stinnerc9d369f2012-06-16 02:22:37 +02001657 if (ch > to_maxchar)
1658 return -1;
Victor Stinnera0702ab2011-09-29 14:14:38 +02001659 PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1660 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001661 }
1662 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001663 return 0;
1664}
1665
Victor Stinnerd3f08822012-05-29 12:57:52 +02001666void
1667_PyUnicode_FastCopyCharacters(
1668 PyObject *to, Py_ssize_t to_start,
1669 PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001670{
1671 (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1672}
1673
1674Py_ssize_t
1675PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1676 PyObject *from, Py_ssize_t from_start,
1677 Py_ssize_t how_many)
1678{
1679 int err;
1680
1681 if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1682 PyErr_BadInternalCall();
1683 return -1;
1684 }
1685
Benjamin Petersonbac79492012-01-14 13:34:47 -05001686 if (PyUnicode_READY(from) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001687 return -1;
Benjamin Petersonbac79492012-01-14 13:34:47 -05001688 if (PyUnicode_READY(to) == -1)
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001689 return -1;
1690
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001691 if ((size_t)from_start > (size_t)PyUnicode_GET_LENGTH(from)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001692 PyErr_SetString(PyExc_IndexError, "string index out of range");
1693 return -1;
1694 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001695 if ((size_t)to_start > (size_t)PyUnicode_GET_LENGTH(to)) {
Victor Stinnerd3f08822012-05-29 12:57:52 +02001696 PyErr_SetString(PyExc_IndexError, "string index out of range");
1697 return -1;
1698 }
Serhiy Storchaka9c0e1f82016-10-08 22:45:38 +03001699 if (how_many < 0) {
1700 PyErr_SetString(PyExc_SystemError, "how_many cannot be negative");
1701 return -1;
1702 }
1703 how_many = Py_MIN(PyUnicode_GET_LENGTH(from)-from_start, how_many);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001704 if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1705 PyErr_Format(PyExc_SystemError,
Victor Stinnera33bce02014-07-04 22:47:46 +02001706 "Cannot write %zi characters at %zi "
1707 "in a string of %zi characters",
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001708 how_many, to_start, PyUnicode_GET_LENGTH(to));
1709 return -1;
1710 }
1711
1712 if (how_many == 0)
1713 return 0;
1714
Victor Stinner488fa492011-12-12 00:01:39 +01001715 if (unicode_check_modifiable(to))
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001716 return -1;
1717
1718 err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1719 if (err) {
1720 PyErr_Format(PyExc_SystemError,
1721 "Cannot copy %s characters "
1722 "into a string of %s characters",
1723 unicode_kind_name(from),
1724 unicode_kind_name(to));
1725 return -1;
1726 }
Victor Stinnera0702ab2011-09-29 14:14:38 +02001727 return how_many;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001728}
1729
Victor Stinner17222162011-09-28 22:15:37 +02001730/* Find the maximum code point and count the number of surrogate pairs so a
1731 correct string length can be computed before converting a string to UCS4.
1732 This function counts single surrogates as a character and not as a pair.
1733
1734 Return 0 on success, or -1 on error. */
1735static int
1736find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1737 Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001738{
1739 const wchar_t *iter;
Victor Stinner8faf8212011-12-08 22:14:11 +01001740 Py_UCS4 ch;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001741
Victor Stinnerc53be962011-10-02 21:33:54 +02001742 assert(num_surrogates != NULL && maxchar != NULL);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001743 *num_surrogates = 0;
1744 *maxchar = 0;
1745
1746 for (iter = begin; iter < end; ) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001747#if SIZEOF_WCHAR_T == 2
Victor Stinnercf77da92013-03-06 01:09:24 +01001748 if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1749 && (iter+1) < end
1750 && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1751 {
1752 ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1753 ++(*num_surrogates);
1754 iter += 2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001755 }
1756 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001757#endif
Victor Stinner8faf8212011-12-08 22:14:11 +01001758 {
1759 ch = *iter;
1760 iter++;
1761 }
1762 if (ch > *maxchar) {
1763 *maxchar = ch;
1764 if (*maxchar > MAX_UNICODE) {
1765 PyErr_Format(PyExc_ValueError,
1766 "character U+%x is not in range [U+0000; U+10ffff]",
1767 ch);
1768 return -1;
1769 }
1770 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001771 }
1772 return 0;
1773}
1774
/* Give a not-yet-ready string (one that only has a wchar_t buffer) its
   canonical 1-, 2- or 4-byte representation, chosen from the largest
   character found in that buffer, and mark it ready.

   Where the character width equals sizeof(wchar_t) the data pointer simply
   shares the wstr buffer; otherwise a new buffer is allocated, filled, and
   the wstr buffer is freed.

   Return 0 on success.  Return -1 with an exception set if
   find_maxchar_surrogates() fails or an allocation fails. */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01001775int
1776_PyUnicode_Ready(PyObject *unicode)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001777{
1778 wchar_t *end;
1779 Py_UCS4 maxchar = 0;
1780 Py_ssize_t num_surrogates;
1781#if SIZEOF_WCHAR_T == 2
1782 Py_ssize_t length_wo_surrogates;
1783#endif
1784
Georg Brandl7597add2011-10-05 16:36:47 +02001785 /* _PyUnicode_Ready() is only intended for old-style API usage where
Victor Stinnerd8f65102011-09-29 19:43:17 +02001786 strings were created using _PyObject_New() and where no canonical
1787 representation (the str field) has been set yet aka strings
1788 which are not yet ready. */
Victor Stinner910337b2011-10-03 03:20:16 +02001789 assert(_PyUnicode_CHECK(unicode));
1790 assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001791 assert(_PyUnicode_WSTR(unicode) != NULL);
Victor Stinnerc3c74152011-10-02 20:39:55 +02001792 assert(_PyUnicode_DATA_ANY(unicode) == NULL);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001793 assert(_PyUnicode_UTF8(unicode) == NULL);
Victor Stinnerd8f65102011-09-29 19:43:17 +02001794 /* Actually, it should neither be interned nor be anything else: */
1795 assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001796
    /* Scan the wchar_t buffer once for the largest character and the
       number of surrogate pairs; the result selects the branch below. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001797 end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinner17222162011-09-28 22:15:37 +02001798 if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
Victor Stinnerd8f65102011-09-29 19:43:17 +02001799 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001800 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001801
    /* maxchar < 256: narrow every wchar_t to one byte in a newly allocated
       buffer (+1 for the terminator), then release the wstr buffer. */
1802 if (maxchar < 256) {
Victor Stinnerc3c74152011-10-02 20:39:55 +02001803 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1804 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001805 PyErr_NoMemory();
1806 return -1;
1807 }
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02001808 _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001809 _PyUnicode_WSTR(unicode), end,
1810 PyUnicode_1BYTE_DATA(unicode));
1811 PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1812 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1813 _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1814 if (maxchar < 128) {
        /* Pure ASCII: the utf8 pointer is set to the one-byte data buffer
           itself, so no separate UTF-8 copy is made. */
Victor Stinnera3b334d2011-10-03 13:53:37 +02001815 _PyUnicode_STATE(unicode).ascii = 1;
Victor Stinnerc3c74152011-10-02 20:39:55 +02001816 _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001817 _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001818 }
1819 else {
Victor Stinnera3b334d2011-10-03 13:53:37 +02001820 _PyUnicode_STATE(unicode).ascii = 0;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001821 _PyUnicode_UTF8(unicode) = NULL;
1822 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001823 }
1824 PyObject_FREE(_PyUnicode_WSTR(unicode));
1825 _PyUnicode_WSTR(unicode) = NULL;
1826 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1827 }
1828 /* In this case we might have to convert down from 4-byte native
1829 wchar_t to 2-byte unicode. */
1830 else if (maxchar < 65536) {
1831 assert(num_surrogates == 0 &&
1832 "FindMaxCharAndNumSurrogatePairs() messed up");
1833
Victor Stinner506f5922011-09-28 22:34:18 +02001834#if SIZEOF_WCHAR_T == 2
1835 /* We can share representations and are done. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001836 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Victor Stinner506f5922011-09-28 22:34:18 +02001837 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1838 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1839 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001840 _PyUnicode_UTF8(unicode) = NULL;
1841 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001842#else
1843 /* sizeof(wchar_t) == 4 */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001844 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
Victor Stinner506f5922011-09-28 22:34:18 +02001845 2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
Victor Stinnerc3c74152011-10-02 20:39:55 +02001846 if (!_PyUnicode_DATA_ANY(unicode)) {
Victor Stinner506f5922011-09-28 22:34:18 +02001847 PyErr_NoMemory();
1848 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001849 }
Victor Stinner506f5922011-09-28 22:34:18 +02001850 _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1851 _PyUnicode_WSTR(unicode), end,
1852 PyUnicode_2BYTE_DATA(unicode));
1853 PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1854 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1855 _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001856 _PyUnicode_UTF8(unicode) = NULL;
1857 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner506f5922011-09-28 22:34:18 +02001858 PyObject_FREE(_PyUnicode_WSTR(unicode));
1859 _PyUnicode_WSTR(unicode) = NULL;
1860 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1861#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001862 }
1863 /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
1864 else {
1865#if SIZEOF_WCHAR_T == 2
1866 /* in case the native representation is 2-bytes, we need to allocate a
1867 new normalized 4-byte version. */
1868 length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
Serhiy Storchakae55181f2015-02-20 21:34:06 +02001869 if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1870 PyErr_NoMemory();
1871 return -1;
1872 }
Victor Stinnerc3c74152011-10-02 20:39:55 +02001873 _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1874 if (!_PyUnicode_DATA_ANY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001875 PyErr_NoMemory();
1876 return -1;
1877 }
1878 _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1879 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001880 _PyUnicode_UTF8(unicode) = NULL;
1881 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Victor Stinner126c5592011-10-03 04:17:10 +02001882 /* unicode_convert_wchar_to_ucs4() requires a ready string */
1883 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerc53be962011-10-02 21:33:54 +02001884 unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001885 PyObject_FREE(_PyUnicode_WSTR(unicode));
1886 _PyUnicode_WSTR(unicode) = NULL;
1887 _PyUnicode_WSTR_LENGTH(unicode) = 0;
1888#else
1889 assert(num_surrogates == 0);
1890
    /* wchar_t is not 2 bytes wide in this build: the data pointer is set
       to the wstr buffer itself, so nothing is copied or freed. */
Victor Stinnerc3c74152011-10-02 20:39:55 +02001891 _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001892 _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001893 _PyUnicode_UTF8(unicode) = NULL;
1894 _PyUnicode_UTF8_LENGTH(unicode) = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001895 _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1896#endif
1897 PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1898 }
1899 _PyUnicode_STATE(unicode).ready = 1;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02001900 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001901 return 0;
1902}
1903
Alexander Belopolsky40018472011-02-26 01:02:56 +00001904static void
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02001905unicode_dealloc(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00001906{
Walter Dörwald16807132007-05-25 13:52:07 +00001907 switch (PyUnicode_CHECK_INTERNED(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00001908 case SSTATE_NOT_INTERNED:
1909 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001910
Benjamin Peterson29060642009-01-31 22:14:21 +00001911 case SSTATE_INTERNED_MORTAL:
1912 /* revive dead object temporarily for DelItem */
Victor Stinnerc86a1122020-02-07 01:24:29 +01001913 Py_SET_REFCNT(unicode, 3);
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001914 if (PyDict_DelItem(interned, unicode) != 0) {
1915 _PyErr_WriteUnraisableMsg("deletion of interned string failed",
1916 NULL);
1917 }
Benjamin Peterson29060642009-01-31 22:14:21 +00001918 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001919
Benjamin Peterson29060642009-01-31 22:14:21 +00001920 case SSTATE_INTERNED_IMMORTAL:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001921 _PyObject_ASSERT_FAILED_MSG(unicode, "Immortal interned string died");
1922 break;
Walter Dörwald16807132007-05-25 13:52:07 +00001923
Benjamin Peterson29060642009-01-31 22:14:21 +00001924 default:
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001925 Py_UNREACHABLE();
Walter Dörwald16807132007-05-25 13:52:07 +00001926 }
1927
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001928 if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001929 PyObject_DEL(_PyUnicode_WSTR(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001930 }
1931 if (_PyUnicode_HAS_UTF8_MEMORY(unicode)) {
Victor Stinnere90fe6a2011-10-01 16:48:13 +02001932 PyObject_DEL(_PyUnicode_UTF8(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001933 }
1934 if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001935 PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
Victor Stinnerec3c99c2020-01-30 12:18:32 +01001936 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02001937
Victor Stinnerb0a82a62011-12-12 13:08:33 +01001938 Py_TYPE(unicode)->tp_free(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00001939}
1940
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001941#ifdef Py_DEBUG
1942static int
1943unicode_is_singleton(PyObject *unicode)
1944{
1945 PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1946 if (unicode == unicode_empty)
1947 return 1;
1948 if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1949 {
1950 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1951 if (ch < 256 && unicode_latin1[ch] == unicode)
1952 return 1;
1953 }
1954 return 0;
1955}
1956#endif
1957
Alexander Belopolsky40018472011-02-26 01:02:56 +00001958static int
Victor Stinner488fa492011-12-12 00:01:39 +01001959unicode_modifiable(PyObject *unicode)
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001960{
Victor Stinner488fa492011-12-12 00:01:39 +01001961 assert(_PyUnicode_CHECK(unicode));
Victor Stinnerfe226c02011-10-03 03:52:20 +02001962 if (Py_REFCNT(unicode) != 1)
1963 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001964 if (_PyUnicode_HASH(unicode) != -1)
1965 return 0;
Victor Stinnerfe226c02011-10-03 03:52:20 +02001966 if (PyUnicode_CHECK_INTERNED(unicode))
1967 return 0;
Victor Stinner488fa492011-12-12 00:01:39 +01001968 if (!PyUnicode_CheckExact(unicode))
1969 return 0;
Victor Stinner77bb47b2011-10-03 20:06:05 +02001970#ifdef Py_DEBUG
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +02001971 /* singleton refcount is greater than 1 */
1972 assert(!unicode_is_singleton(unicode));
Victor Stinner77bb47b2011-10-03 20:06:05 +02001973#endif
Victor Stinnerfe226c02011-10-03 03:52:20 +02001974 return 1;
1975}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00001976
Victor Stinnerfe226c02011-10-03 03:52:20 +02001977static int
1978unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1979{
1980 PyObject *unicode;
1981 Py_ssize_t old_length;
1982
1983 assert(p_unicode != NULL);
1984 unicode = *p_unicode;
1985
1986 assert(unicode != NULL);
1987 assert(PyUnicode_Check(unicode));
1988 assert(0 <= length);
1989
Victor Stinner910337b2011-10-03 03:20:16 +02001990 if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
Victor Stinnerfe226c02011-10-03 03:52:20 +02001991 old_length = PyUnicode_WSTR_LENGTH(unicode);
1992 else
1993 old_length = PyUnicode_GET_LENGTH(unicode);
1994 if (old_length == length)
1995 return 0;
1996
Martin v. Löwise9b11c12011-11-08 17:35:34 +01001997 if (length == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +02001998 _Py_INCREF_UNICODE_EMPTY();
1999 if (!unicode_empty)
Benjamin Peterson29060642009-01-31 22:14:21 +00002000 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002001 Py_SETREF(*p_unicode, unicode_empty);
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002002 return 0;
2003 }
2004
Victor Stinner488fa492011-12-12 00:01:39 +01002005 if (!unicode_modifiable(unicode)) {
Victor Stinnerfe226c02011-10-03 03:52:20 +02002006 PyObject *copy = resize_copy(unicode, length);
2007 if (copy == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00002008 return -1;
Serhiy Storchaka57a01d32016-04-10 18:05:40 +03002009 Py_SETREF(*p_unicode, copy);
Benjamin Peterson29060642009-01-31 22:14:21 +00002010 return 0;
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002011 }
2012
Victor Stinnerfe226c02011-10-03 03:52:20 +02002013 if (PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002014 PyObject *new_unicode = resize_compact(unicode, length);
2015 if (new_unicode == NULL)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002016 return -1;
Victor Stinnerb0a82a62011-12-12 13:08:33 +01002017 *p_unicode = new_unicode;
Victor Stinnerfe226c02011-10-03 03:52:20 +02002018 return 0;
Benjamin Peterson4bfce8f2011-10-03 19:35:07 -04002019 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002020 return resize_inplace(unicode, length);
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002021}
2022
Alexander Belopolsky40018472011-02-26 01:02:56 +00002023int
Victor Stinnerfe226c02011-10-03 03:52:20 +02002024PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002025{
Victor Stinnerfe226c02011-10-03 03:52:20 +02002026 PyObject *unicode;
2027 if (p_unicode == NULL) {
2028 PyErr_BadInternalCall();
2029 return -1;
2030 }
2031 unicode = *p_unicode;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01002032 if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
Victor Stinnerfe226c02011-10-03 03:52:20 +02002033 {
2034 PyErr_BadInternalCall();
2035 return -1;
2036 }
2037 return unicode_resize(p_unicode, length);
Alexandre Vassalottiaa0e5312008-12-27 06:43:58 +00002038}
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002039
Serhiy Storchakad65c9492015-11-02 14:10:23 +02002040/* Copy an ASCII or latin1 char* string into a Python Unicode string.
Victor Stinnerc5166102012-02-22 13:55:02 +01002041
Victor Stinnerb429d3b2012-02-22 21:22:20 +01002042 WARNING: The function doesn't copy the terminating null character and
2043 doesn't check the maximum character (may write a latin1 character in an
2044 ASCII string). */
Victor Stinner184252a2012-06-16 02:57:41 +02002045static void
2046unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
2047 const char *str, Py_ssize_t len)
Victor Stinnerc5166102012-02-22 13:55:02 +01002048{
2049 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
2050 void *data = PyUnicode_DATA(unicode);
Victor Stinner184252a2012-06-16 02:57:41 +02002051 const char *end = str + len;
Victor Stinnerc5166102012-02-22 13:55:02 +01002052
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002053 assert(index + len <= PyUnicode_GET_LENGTH(unicode));
Victor Stinnerc5166102012-02-22 13:55:02 +01002054 switch (kind) {
2055 case PyUnicode_1BYTE_KIND: {
Victor Stinner8c6db452012-10-06 00:40:45 +02002056#ifdef Py_DEBUG
2057 if (PyUnicode_IS_ASCII(unicode)) {
2058 Py_UCS4 maxchar = ucs1lib_find_max_char(
2059 (const Py_UCS1*)str,
2060 (const Py_UCS1*)str + len);
2061 assert(maxchar < 128);
2062 }
2063#endif
Antoine Pitrouba6bafc2012-02-22 16:41:50 +01002064 memcpy((char *) data + index, str, len);
Victor Stinner184252a2012-06-16 02:57:41 +02002065 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002066 }
2067 case PyUnicode_2BYTE_KIND: {
2068 Py_UCS2 *start = (Py_UCS2 *)data + index;
2069 Py_UCS2 *ucs2 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002070
Victor Stinner184252a2012-06-16 02:57:41 +02002071 for (; str < end; ++ucs2, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002072 *ucs2 = (Py_UCS2)*str;
2073
2074 assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner184252a2012-06-16 02:57:41 +02002075 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002076 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002077 case PyUnicode_4BYTE_KIND: {
Victor Stinnerc5166102012-02-22 13:55:02 +01002078 Py_UCS4 *start = (Py_UCS4 *)data + index;
2079 Py_UCS4 *ucs4 = start;
Victor Stinnerc5166102012-02-22 13:55:02 +01002080
Victor Stinner184252a2012-06-16 02:57:41 +02002081 for (; str < end; ++ucs4, ++str)
Victor Stinnerc5166102012-02-22 13:55:02 +01002082 *ucs4 = (Py_UCS4)*str;
2083
2084 assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002085 break;
Victor Stinnerc5166102012-02-22 13:55:02 +01002086 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002087 default:
2088 Py_UNREACHABLE();
Victor Stinnerc5166102012-02-22 13:55:02 +01002089 }
2090}
2091
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002092static PyObject*
2093get_latin1_char(unsigned char ch)
2094{
Victor Stinnera464fc12011-10-02 20:39:30 +02002095 PyObject *unicode = unicode_latin1[ch];
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002096 if (!unicode) {
Victor Stinnera464fc12011-10-02 20:39:30 +02002097 unicode = PyUnicode_New(1, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002098 if (!unicode)
2099 return NULL;
2100 PyUnicode_1BYTE_DATA(unicode)[0] = ch;
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002101 assert(_PyUnicode_CheckConsistency(unicode, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002102 unicode_latin1[ch] = unicode;
2103 }
2104 Py_INCREF(unicode);
Victor Stinnera464fc12011-10-02 20:39:30 +02002105 return unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002106}
2107
Victor Stinner985a82a2014-01-03 12:53:47 +01002108static PyObject*
2109unicode_char(Py_UCS4 ch)
2110{
2111 PyObject *unicode;
2112
2113 assert(ch <= MAX_UNICODE);
2114
Victor Stinnerf3b46b42014-01-03 13:16:00 +01002115 if (ch < 256)
2116 return get_latin1_char(ch);
2117
Victor Stinner985a82a2014-01-03 12:53:47 +01002118 unicode = PyUnicode_New(1, ch);
2119 if (unicode == NULL)
2120 return NULL;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002121
2122 assert(PyUnicode_KIND(unicode) != PyUnicode_1BYTE_KIND);
2123 if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
Victor Stinner985a82a2014-01-03 12:53:47 +01002124 PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
Serhiy Storchaka2e58f1a2016-10-09 23:44:48 +03002125 } else {
Victor Stinner985a82a2014-01-03 12:53:47 +01002126 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
2127 PyUnicode_4BYTE_DATA(unicode)[0] = ch;
2128 }
2129 assert(_PyUnicode_CheckConsistency(unicode, 1));
2130 return unicode;
2131}
2132
Alexander Belopolsky40018472011-02-26 01:02:56 +00002133PyObject *
2134PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00002135{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002136 if (u == NULL)
2137 return (PyObject*)_PyUnicode_New(size);
2138
2139 if (size < 0) {
2140 PyErr_BadInternalCall();
2141 return NULL;
2142 }
2143
2144 return PyUnicode_FromWideChar(u, size);
2145}
2146
2147PyObject *
2148PyUnicode_FromWideChar(const wchar_t *u, Py_ssize_t size)
2149{
Victor Stinner9db1a8b2011-10-23 20:04:37 +02002150 PyObject *unicode;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002151 Py_UCS4 maxchar = 0;
2152 Py_ssize_t num_surrogates;
2153
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02002154 if (u == NULL && size != 0) {
2155 PyErr_BadInternalCall();
2156 return NULL;
2157 }
2158
2159 if (size == -1) {
2160 size = wcslen(u);
2161 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002162
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002163 /* If the Unicode data is known at construction time, we can apply
2164 some optimizations which share commonly used objects. */
Marc-André Lemburg8155e0e2001-04-23 14:44:21 +00002165
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002166 /* Optimization for empty strings */
Serhiy Storchaka678db842013-01-26 12:16:36 +02002167 if (size == 0)
2168 _Py_RETURN_UNICODE_EMPTY();
Tim Petersced69f82003-09-16 20:30:58 +00002169
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002170 /* Single character Unicode objects in the Latin-1 range are
2171 shared when using this constructor */
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002172 if (size == 1 && (Py_UCS4)*u < 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002173 return get_latin1_char((unsigned char)*u);
2174
2175 /* If not empty and not single character, copy the Unicode data
2176 into the new object */
Victor Stinnerd8f65102011-09-29 19:43:17 +02002177 if (find_maxchar_surrogates(u, u + size,
2178 &maxchar, &num_surrogates) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002179 return NULL;
2180
Victor Stinner8faf8212011-12-08 22:14:11 +01002181 unicode = PyUnicode_New(size - num_surrogates, maxchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002182 if (!unicode)
2183 return NULL;
2184
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002185 switch (PyUnicode_KIND(unicode)) {
2186 case PyUnicode_1BYTE_KIND:
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002187 _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002188 u, u + size, PyUnicode_1BYTE_DATA(unicode));
2189 break;
2190 case PyUnicode_2BYTE_KIND:
2191#if Py_UNICODE_SIZE == 2
Christian Heimesf051e432016-09-13 20:22:02 +02002192 memcpy(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002193#else
Victor Stinnerfb5f5f22011-09-28 21:39:49 +02002194 _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002195 u, u + size, PyUnicode_2BYTE_DATA(unicode));
2196#endif
2197 break;
2198 case PyUnicode_4BYTE_KIND:
2199#if SIZEOF_WCHAR_T == 2
2200 /* This is the only case which has to process surrogates, thus
2201 a simple copy loop is not enough and we need a function. */
Victor Stinnerc53be962011-10-02 21:33:54 +02002202 unicode_convert_wchar_to_ucs4(u, u + size, unicode);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002203#else
2204 assert(num_surrogates == 0);
Christian Heimesf051e432016-09-13 20:22:02 +02002205 memcpy(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002206#endif
2207 break;
2208 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002209 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002210 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00002211
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002212 return unicode_result(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00002213}
2214
Alexander Belopolsky40018472011-02-26 01:02:56 +00002215PyObject *
2216PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002217{
Benjamin Peterson14339b62009-01-31 16:36:08 +00002218 if (size < 0) {
2219 PyErr_SetString(PyExc_SystemError,
Benjamin Peterson29060642009-01-31 22:14:21 +00002220 "Negative size passed to PyUnicode_FromStringAndSize");
Benjamin Peterson14339b62009-01-31 16:36:08 +00002221 return NULL;
2222 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002223 if (u != NULL)
2224 return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
2225 else
2226 return (PyObject *)_PyUnicode_New(size);
Walter Dörwaldacaa5a12007-05-05 12:00:46 +00002227}
2228
Alexander Belopolsky40018472011-02-26 01:02:56 +00002229PyObject *
2230PyUnicode_FromString(const char *u)
Walter Dörwaldd2034312007-05-18 16:29:38 +00002231{
2232 size_t size = strlen(u);
2233 if (size > PY_SSIZE_T_MAX) {
2234 PyErr_SetString(PyExc_OverflowError, "input too long");
2235 return NULL;
2236 }
Victor Stinnera1d12bb2011-12-11 21:53:09 +01002237 return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
Walter Dörwaldd2034312007-05-18 16:29:38 +00002238}
2239
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002240PyObject *
2241_PyUnicode_FromId(_Py_Identifier *id)
2242{
2243 if (!id->object) {
Victor Stinnerd1cd99b2012-02-07 23:05:55 +01002244 id->object = PyUnicode_DecodeUTF8Stateful(id->string,
2245 strlen(id->string),
2246 NULL, NULL);
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002247 if (!id->object)
2248 return NULL;
2249 PyUnicode_InternInPlace(&id->object);
2250 assert(!id->next);
2251 id->next = static_strings;
2252 static_strings = id;
2253 }
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002254 return id->object;
2255}
2256
2257void
2258_PyUnicode_ClearStaticStrings()
2259{
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002260 _Py_Identifier *tmp, *s = static_strings;
2261 while (s) {
Serhiy Storchaka505ff752014-02-09 13:33:53 +02002262 Py_CLEAR(s->object);
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002263 tmp = s->next;
2264 s->next = NULL;
2265 s = tmp;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002266 }
Benjamin Peterson0c270a82013-01-09 09:52:01 -06002267 static_strings = NULL;
Martin v. Löwisafe55bb2011-10-09 10:38:36 +02002268}
2269
Benjamin Peterson0df54292012-03-26 14:50:32 -04002270/* Internal function, doesn't check maximum character */
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002271
Victor Stinnerd3f08822012-05-29 12:57:52 +02002272PyObject*
2273_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
Victor Stinner702c7342011-10-05 13:50:52 +02002274{
Victor Stinnerd3f08822012-05-29 12:57:52 +02002275 const unsigned char *s = (const unsigned char *)buffer;
Victor Stinner785938e2011-12-11 20:09:03 +01002276 PyObject *unicode;
Victor Stinnere6b2d442011-12-11 21:54:30 +01002277 if (size == 1) {
Victor Stinner0617b6e2011-10-05 23:26:01 +02002278#ifdef Py_DEBUG
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002279 assert((unsigned char)s[0] < 128);
Victor Stinner0617b6e2011-10-05 23:26:01 +02002280#endif
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002281 return get_latin1_char(s[0]);
Victor Stinnere6b2d442011-12-11 21:54:30 +01002282 }
Victor Stinner785938e2011-12-11 20:09:03 +01002283 unicode = PyUnicode_New(size, 127);
2284 if (!unicode)
Victor Stinner702c7342011-10-05 13:50:52 +02002285 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01002286 memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
2287 assert(_PyUnicode_CheckConsistency(unicode, 1));
2288 return unicode;
Victor Stinner702c7342011-10-05 13:50:52 +02002289}
2290
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002291static Py_UCS4
2292kind_maxchar_limit(unsigned int kind)
2293{
Benjamin Petersonead6b532011-12-20 17:23:42 -06002294 switch (kind) {
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002295 case PyUnicode_1BYTE_KIND:
2296 return 0x80;
2297 case PyUnicode_2BYTE_KIND:
2298 return 0x100;
2299 case PyUnicode_4BYTE_KIND:
2300 return 0x10000;
2301 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002302 Py_UNREACHABLE();
Victor Stinnerc80d6d22011-10-05 14:13:28 +02002303 }
2304}
2305
Victor Stinner702c7342011-10-05 13:50:52 +02002306static PyObject*
Victor Stinnerd21b58c2013-02-26 00:15:54 +01002307_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
Mark Dickinson081dfee2009-03-18 14:47:41 +00002308{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002309 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002310 unsigned char max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002311
Serhiy Storchaka678db842013-01-26 12:16:36 +02002312 if (size == 0)
2313 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002314 assert(size > 0);
Antoine Pitrou7c46da72011-10-06 22:07:51 +02002315 if (size == 1)
2316 return get_latin1_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002317
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002318 max_char = ucs1lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002319 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002320 if (!res)
2321 return NULL;
2322 memcpy(PyUnicode_1BYTE_DATA(res), u, size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002323 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002324 return res;
Mark Dickinson081dfee2009-03-18 14:47:41 +00002325}
2326
Victor Stinnere57b1c02011-09-28 22:20:48 +02002327static PyObject*
2328_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002329{
2330 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002331 Py_UCS2 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002332
Serhiy Storchaka678db842013-01-26 12:16:36 +02002333 if (size == 0)
2334 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002335 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002336 if (size == 1)
2337 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002338
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002339 max_char = ucs2lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002340 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002341 if (!res)
2342 return NULL;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002343 if (max_char >= 256)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002344 memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002345 else {
2346 _PyUnicode_CONVERT_BYTES(
2347 Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2348 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002349 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002350 return res;
2351}
2352
Victor Stinnere57b1c02011-09-28 22:20:48 +02002353static PyObject*
2354_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002355{
2356 PyObject *res;
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002357 Py_UCS4 max_char;
Victor Stinnerb9275c12011-10-05 14:01:42 +02002358
Serhiy Storchaka678db842013-01-26 12:16:36 +02002359 if (size == 0)
2360 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002361 assert(size > 0);
Victor Stinner985a82a2014-01-03 12:53:47 +01002362 if (size == 1)
2363 return unicode_char(u[0]);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01002364
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002365 max_char = ucs4lib_find_max_char(u, u + size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002366 res = PyUnicode_New(size, max_char);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002367 if (!res)
2368 return NULL;
Antoine Pitrou950468e2011-10-11 22:45:48 +02002369 if (max_char < 256)
2370 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2371 PyUnicode_1BYTE_DATA(res));
2372 else if (max_char < 0x10000)
2373 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2374 PyUnicode_2BYTE_DATA(res));
2375 else
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002376 memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002377 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002378 return res;
2379}
2380
2381PyObject*
2382PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2383{
Victor Stinnercfed46e2011-11-22 01:29:14 +01002384 if (size < 0) {
2385 PyErr_SetString(PyExc_ValueError, "size must be positive");
2386 return NULL;
2387 }
Benjamin Petersonead6b532011-12-20 17:23:42 -06002388 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002389 case PyUnicode_1BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002390 return _PyUnicode_FromUCS1(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002391 case PyUnicode_2BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002392 return _PyUnicode_FromUCS2(buffer, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002393 case PyUnicode_4BYTE_KIND:
Victor Stinnere57b1c02011-09-28 22:20:48 +02002394 return _PyUnicode_FromUCS4(buffer, size);
Victor Stinnerb9275c12011-10-05 14:01:42 +02002395 default:
Victor Stinnerb9275c12011-10-05 14:01:42 +02002396 PyErr_SetString(PyExc_SystemError, "invalid kind");
2397 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002398 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002399}
2400
Victor Stinnerece58de2012-04-23 23:36:38 +02002401Py_UCS4
2402_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2403{
2404 enum PyUnicode_Kind kind;
2405 void *startptr, *endptr;
2406
2407 assert(PyUnicode_IS_READY(unicode));
2408 assert(0 <= start);
2409 assert(end <= PyUnicode_GET_LENGTH(unicode));
2410 assert(start <= end);
2411
2412 if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2413 return PyUnicode_MAX_CHAR_VALUE(unicode);
2414
2415 if (start == end)
2416 return 127;
2417
Victor Stinner94d558b2012-04-27 22:26:58 +02002418 if (PyUnicode_IS_ASCII(unicode))
2419 return 127;
2420
Victor Stinnerece58de2012-04-23 23:36:38 +02002421 kind = PyUnicode_KIND(unicode);
Benjamin Petersonf3b7d862012-04-23 18:07:01 -04002422 startptr = PyUnicode_DATA(unicode);
Benjamin Petersonb9f4c9d2012-04-23 21:45:40 -04002423 endptr = (char *)startptr + end * kind;
2424 startptr = (char *)startptr + start * kind;
Benjamin Peterson2844a7a2012-04-23 18:00:25 -04002425 switch(kind) {
2426 case PyUnicode_1BYTE_KIND:
2427 return ucs1lib_find_max_char(startptr, endptr);
2428 case PyUnicode_2BYTE_KIND:
2429 return ucs2lib_find_max_char(startptr, endptr);
2430 case PyUnicode_4BYTE_KIND:
2431 return ucs4lib_find_max_char(startptr, endptr);
Victor Stinnerece58de2012-04-23 23:36:38 +02002432 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07002433 Py_UNREACHABLE();
Victor Stinnerece58de2012-04-23 23:36:38 +02002434 }
2435}
2436
Victor Stinner25a4b292011-10-06 12:31:55 +02002437/* Ensure that a string uses the most efficient storage, if it is not the
2438 case: create a new string with of the right kind. Write NULL into *p_unicode
2439 on error. */
Antoine Pitrou53bb5482011-10-10 23:49:24 +02002440static void
Victor Stinner25a4b292011-10-06 12:31:55 +02002441unicode_adjust_maxchar(PyObject **p_unicode)
2442{
2443 PyObject *unicode, *copy;
2444 Py_UCS4 max_char;
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002445 Py_ssize_t len;
Victor Stinner25a4b292011-10-06 12:31:55 +02002446 unsigned int kind;
2447
2448 assert(p_unicode != NULL);
2449 unicode = *p_unicode;
2450 assert(PyUnicode_IS_READY(unicode));
2451 if (PyUnicode_IS_ASCII(unicode))
2452 return;
2453
2454 len = PyUnicode_GET_LENGTH(unicode);
2455 kind = PyUnicode_KIND(unicode);
2456 if (kind == PyUnicode_1BYTE_KIND) {
2457 const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002458 max_char = ucs1lib_find_max_char(u, u + len);
2459 if (max_char >= 128)
2460 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002461 }
2462 else if (kind == PyUnicode_2BYTE_KIND) {
2463 const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002464 max_char = ucs2lib_find_max_char(u, u + len);
2465 if (max_char >= 256)
2466 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002467 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002468 else if (kind == PyUnicode_4BYTE_KIND) {
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002469 const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
Antoine Pitroudd4e2f02011-10-13 00:02:27 +02002470 max_char = ucs4lib_find_max_char(u, u + len);
2471 if (max_char >= 0x10000)
2472 return;
Victor Stinner25a4b292011-10-06 12:31:55 +02002473 }
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002474 else
2475 Py_UNREACHABLE();
2476
Victor Stinner25a4b292011-10-06 12:31:55 +02002477 copy = PyUnicode_New(len, max_char);
Victor Stinnerca439ee2012-06-16 03:17:34 +02002478 if (copy != NULL)
2479 _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
Victor Stinner25a4b292011-10-06 12:31:55 +02002480 Py_DECREF(unicode);
2481 *p_unicode = copy;
2482}
2483
Victor Stinner034f6cf2011-09-30 02:26:44 +02002484PyObject*
Victor Stinnerbf6e5602011-12-12 01:53:47 +01002485_PyUnicode_Copy(PyObject *unicode)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002486{
Victor Stinner87af4f22011-11-21 23:03:47 +01002487 Py_ssize_t length;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002488 PyObject *copy;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002489
Victor Stinner034f6cf2011-09-30 02:26:44 +02002490 if (!PyUnicode_Check(unicode)) {
2491 PyErr_BadInternalCall();
2492 return NULL;
2493 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05002494 if (PyUnicode_READY(unicode) == -1)
Victor Stinner034f6cf2011-09-30 02:26:44 +02002495 return NULL;
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002496
Victor Stinner87af4f22011-11-21 23:03:47 +01002497 length = PyUnicode_GET_LENGTH(unicode);
2498 copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002499 if (!copy)
2500 return NULL;
2501 assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2502
Christian Heimesf051e432016-09-13 20:22:02 +02002503 memcpy(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
Victor Stinner87af4f22011-11-21 23:03:47 +01002504 length * PyUnicode_KIND(unicode));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +02002505 assert(_PyUnicode_CheckConsistency(copy, 1));
Victor Stinnerc841e7d2011-10-01 01:34:32 +02002506 return copy;
Victor Stinner034f6cf2011-09-30 02:26:44 +02002507}
2508
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002509
Victor Stinnerbc603d12011-10-02 01:00:40 +02002510/* Widen Unicode objects to larger buffers. Don't write terminating null
2511 character. Return NULL on error. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002512
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002513static void*
2514unicode_askind(unsigned int skind, void const *data, Py_ssize_t len, unsigned int kind)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002515{
Victor Stinnerbc603d12011-10-02 01:00:40 +02002516 void *result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002517
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002518 assert(skind < kind);
Benjamin Petersonead6b532011-12-20 17:23:42 -06002519 switch (kind) {
Victor Stinnerbc603d12011-10-02 01:00:40 +02002520 case PyUnicode_2BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002521 result = PyMem_New(Py_UCS2, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002522 if (!result)
2523 return PyErr_NoMemory();
2524 assert(skind == PyUnicode_1BYTE_KIND);
2525 _PyUnicode_CONVERT_BYTES(
2526 Py_UCS1, Py_UCS2,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002527 (const Py_UCS1 *)data,
2528 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002529 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002530 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002531 case PyUnicode_4BYTE_KIND:
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002532 result = PyMem_New(Py_UCS4, len);
Victor Stinnerbc603d12011-10-02 01:00:40 +02002533 if (!result)
2534 return PyErr_NoMemory();
2535 if (skind == PyUnicode_2BYTE_KIND) {
2536 _PyUnicode_CONVERT_BYTES(
2537 Py_UCS2, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002538 (const Py_UCS2 *)data,
2539 ((const Py_UCS2 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002540 result);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002541 }
Victor Stinnerbc603d12011-10-02 01:00:40 +02002542 else {
2543 assert(skind == PyUnicode_1BYTE_KIND);
2544 _PyUnicode_CONVERT_BYTES(
2545 Py_UCS1, Py_UCS4,
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002546 (const Py_UCS1 *)data,
2547 ((const Py_UCS1 *)data) + len,
Victor Stinnerbc603d12011-10-02 01:00:40 +02002548 result);
2549 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002550 return result;
Victor Stinnerbc603d12011-10-02 01:00:40 +02002551 default:
Serhiy Storchaka17b47332020-04-01 15:41:49 +03002552 Py_UNREACHABLE();
2553 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002554 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002555}
2556
2557static Py_UCS4*
2558as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2559 int copy_null)
2560{
2561 int kind;
2562 void *data;
2563 Py_ssize_t len, targetlen;
2564 if (PyUnicode_READY(string) == -1)
2565 return NULL;
2566 kind = PyUnicode_KIND(string);
2567 data = PyUnicode_DATA(string);
2568 len = PyUnicode_GET_LENGTH(string);
2569 targetlen = len;
2570 if (copy_null)
2571 targetlen++;
2572 if (!target) {
Serhiy Storchaka1a1ff292015-02-16 13:28:22 +02002573 target = PyMem_New(Py_UCS4, targetlen);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002574 if (!target) {
2575 PyErr_NoMemory();
2576 return NULL;
2577 }
2578 }
2579 else {
2580 if (targetsize < targetlen) {
2581 PyErr_Format(PyExc_SystemError,
2582 "string is longer than the buffer");
2583 if (copy_null && 0 < targetsize)
2584 target[0] = 0;
2585 return NULL;
2586 }
2587 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002588 if (kind == PyUnicode_1BYTE_KIND) {
2589 Py_UCS1 *start = (Py_UCS1 *) data;
2590 _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002591 }
Antoine Pitrou950468e2011-10-11 22:45:48 +02002592 else if (kind == PyUnicode_2BYTE_KIND) {
2593 Py_UCS2 *start = (Py_UCS2 *) data;
2594 _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2595 }
2596 else {
2597 assert(kind == PyUnicode_4BYTE_KIND);
Christian Heimesf051e432016-09-13 20:22:02 +02002598 memcpy(target, data, len * sizeof(Py_UCS4));
Antoine Pitrou950468e2011-10-11 22:45:48 +02002599 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002600 if (copy_null)
2601 target[len] = 0;
2602 return target;
2603}
2604
2605Py_UCS4*
2606PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2607 int copy_null)
2608{
Antoine Pitroude20b0b2011-11-10 21:47:38 +01002609 if (target == NULL || targetsize < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02002610 PyErr_BadInternalCall();
2611 return NULL;
2612 }
2613 return as_ucs4(string, target, targetsize, copy_null);
2614}
2615
2616Py_UCS4*
2617PyUnicode_AsUCS4Copy(PyObject *string)
2618{
2619 return as_ucs4(string, NULL, 0, 1);
2620}
2621
/* Upper bound on the number of characters needed for the output of %lld
   or %p.  A long long has SIZEOF_LONG_LONG bytes and each byte contributes
   at most log10(256) decimal digits; 53/22 is an upper bound for
   log10(256).  The constant 2 covers rounding the digit count up and one
   character for the sign. */
#define MAX_LONG_LONG_CHARS (2 + ((SIZEOF_LONG_LONG * 53) - 1) / 22)
Victor Stinner96865452011-03-01 23:44:09 +00002626
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002627static int
2628unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2629 Py_ssize_t width, Py_ssize_t precision)
2630{
2631 Py_ssize_t length, fill, arglen;
2632 Py_UCS4 maxchar;
2633
2634 if (PyUnicode_READY(str) == -1)
2635 return -1;
2636
2637 length = PyUnicode_GET_LENGTH(str);
2638 if ((precision == -1 || precision >= length)
2639 && width <= length)
2640 return _PyUnicodeWriter_WriteStr(writer, str);
2641
2642 if (precision != -1)
2643 length = Py_MIN(precision, length);
2644
2645 arglen = Py_MAX(length, width);
2646 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2647 maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2648 else
2649 maxchar = writer->maxchar;
2650
2651 if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2652 return -1;
2653
2654 if (width > length) {
2655 fill = width - length;
2656 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2657 return -1;
2658 writer->pos += fill;
2659 }
2660
2661 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2662 str, 0, length);
2663 writer->pos += length;
2664 return 0;
2665}
2666
2667static int
Victor Stinner998b8062018-09-12 00:23:25 +02002668unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002669 Py_ssize_t width, Py_ssize_t precision)
2670{
2671 /* UTF-8 */
2672 Py_ssize_t length;
2673 PyObject *unicode;
2674 int res;
2675
Serhiy Storchakad586ccb2019-01-12 10:30:35 +02002676 if (precision == -1) {
2677 length = strlen(str);
2678 }
2679 else {
2680 length = 0;
2681 while (length < precision && str[length]) {
2682 length++;
2683 }
2684 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002685 unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2686 if (unicode == NULL)
2687 return -1;
2688
2689 res = unicode_fromformat_write_str(writer, unicode, width, -1);
2690 Py_DECREF(unicode);
2691 return res;
2692}
2693
Victor Stinner96865452011-03-01 23:44:09 +00002694static const char*
Victor Stinnere215d962012-10-06 23:03:36 +02002695unicode_fromformat_arg(_PyUnicodeWriter *writer,
2696 const char *f, va_list *vargs)
Victor Stinner96865452011-03-01 23:44:09 +00002697{
Victor Stinnere215d962012-10-06 23:03:36 +02002698 const char *p;
2699 Py_ssize_t len;
2700 int zeropad;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002701 Py_ssize_t width;
2702 Py_ssize_t precision;
Victor Stinnere215d962012-10-06 23:03:36 +02002703 int longflag;
2704 int longlongflag;
2705 int size_tflag;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002706 Py_ssize_t fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002707
2708 p = f;
2709 f++;
Victor Stinner4c63a972012-10-06 23:55:33 +02002710 zeropad = 0;
2711 if (*f == '0') {
2712 zeropad = 1;
2713 f++;
2714 }
Victor Stinner96865452011-03-01 23:44:09 +00002715
2716 /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002717 width = -1;
2718 if (Py_ISDIGIT((unsigned)*f)) {
2719 width = *f - '0';
Victor Stinner96865452011-03-01 23:44:09 +00002720 f++;
Victor Stinnere215d962012-10-06 23:03:36 +02002721 while (Py_ISDIGIT((unsigned)*f)) {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002722 if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
Victor Stinner3921e902012-10-06 23:05:00 +02002723 PyErr_SetString(PyExc_ValueError,
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002724 "width too big");
Victor Stinner3921e902012-10-06 23:05:00 +02002725 return NULL;
2726 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002727 width = (width * 10) + (*f - '0');
Victor Stinnere215d962012-10-06 23:03:36 +02002728 f++;
2729 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002730 }
2731 precision = -1;
2732 if (*f == '.') {
2733 f++;
2734 if (Py_ISDIGIT((unsigned)*f)) {
2735 precision = (*f - '0');
2736 f++;
2737 while (Py_ISDIGIT((unsigned)*f)) {
2738 if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2739 PyErr_SetString(PyExc_ValueError,
2740 "precision too big");
2741 return NULL;
2742 }
2743 precision = (precision * 10) + (*f - '0');
2744 f++;
2745 }
2746 }
Victor Stinner96865452011-03-01 23:44:09 +00002747 if (*f == '%') {
2748 /* "%.3%s" => f points to "3" */
2749 f--;
2750 }
2751 }
2752 if (*f == '\0') {
Victor Stinnere215d962012-10-06 23:03:36 +02002753 /* bogus format "%.123" => go backward, f points to "3" */
Victor Stinner96865452011-03-01 23:44:09 +00002754 f--;
2755 }
Victor Stinner96865452011-03-01 23:44:09 +00002756
2757 /* Handle %ld, %lu, %lld and %llu. */
2758 longflag = 0;
2759 longlongflag = 0;
Victor Stinnere7faec12011-03-02 00:01:53 +00002760 size_tflag = 0;
Victor Stinner96865452011-03-01 23:44:09 +00002761 if (*f == 'l') {
Victor Stinner6d970f42011-03-02 00:04:25 +00002762 if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
Victor Stinner96865452011-03-01 23:44:09 +00002763 longflag = 1;
2764 ++f;
2765 }
Victor Stinner96865452011-03-01 23:44:09 +00002766 else if (f[1] == 'l' &&
Victor Stinner6d970f42011-03-02 00:04:25 +00002767 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002768 longlongflag = 1;
2769 f += 2;
2770 }
Victor Stinner96865452011-03-01 23:44:09 +00002771 }
2772 /* handle the size_t flag. */
Victor Stinner6d970f42011-03-02 00:04:25 +00002773 else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
Victor Stinner96865452011-03-01 23:44:09 +00002774 size_tflag = 1;
2775 ++f;
2776 }
Victor Stinnere215d962012-10-06 23:03:36 +02002777
2778 if (f[1] == '\0')
2779 writer->overallocate = 0;
2780
2781 switch (*f) {
2782 case 'c':
2783 {
2784 int ordinal = va_arg(*vargs, int);
Victor Stinnerff5a8482012-10-06 23:05:45 +02002785 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Serhiy Storchakac89533f2013-06-23 20:21:16 +03002786 PyErr_SetString(PyExc_OverflowError,
Victor Stinnerff5a8482012-10-06 23:05:45 +02002787 "character argument not in range(0x110000)");
2788 return NULL;
2789 }
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002790 if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002791 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002792 break;
2793 }
2794
2795 case 'i':
2796 case 'd':
2797 case 'u':
2798 case 'x':
2799 {
2800 /* used by sprintf */
Victor Stinner15a11362012-10-06 23:48:20 +02002801 char buffer[MAX_LONG_LONG_CHARS];
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002802 Py_ssize_t arglen;
Victor Stinnere215d962012-10-06 23:03:36 +02002803
2804 if (*f == 'u') {
Victor Stinnere215d962012-10-06 23:03:36 +02002805 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002806 len = sprintf(buffer, "%lu",
Victor Stinnere215d962012-10-06 23:03:36 +02002807 va_arg(*vargs, unsigned long));
Victor Stinnere215d962012-10-06 23:03:36 +02002808 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002809 len = sprintf(buffer, "%llu",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002810 va_arg(*vargs, unsigned long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002811 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002812 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
Victor Stinnere215d962012-10-06 23:03:36 +02002813 va_arg(*vargs, size_t));
2814 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002815 len = sprintf(buffer, "%u",
Victor Stinnere215d962012-10-06 23:03:36 +02002816 va_arg(*vargs, unsigned int));
2817 }
2818 else if (*f == 'x') {
Victor Stinner3aa979e2014-11-18 21:40:51 +01002819 len = sprintf(buffer, "%x", va_arg(*vargs, int));
Victor Stinnere215d962012-10-06 23:03:36 +02002820 }
2821 else {
Victor Stinnere215d962012-10-06 23:03:36 +02002822 if (longflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002823 len = sprintf(buffer, "%li",
Victor Stinnere215d962012-10-06 23:03:36 +02002824 va_arg(*vargs, long));
Victor Stinnere215d962012-10-06 23:03:36 +02002825 else if (longlongflag)
Benjamin Peterson47ff0732016-09-08 09:15:54 -07002826 len = sprintf(buffer, "%lli",
Benjamin Petersonaf580df2016-09-06 10:46:49 -07002827 va_arg(*vargs, long long));
Victor Stinnere215d962012-10-06 23:03:36 +02002828 else if (size_tflag)
Victor Stinner3aa979e2014-11-18 21:40:51 +01002829 len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
Victor Stinnere215d962012-10-06 23:03:36 +02002830 va_arg(*vargs, Py_ssize_t));
2831 else
Victor Stinner3aa979e2014-11-18 21:40:51 +01002832 len = sprintf(buffer, "%i",
Victor Stinnere215d962012-10-06 23:03:36 +02002833 va_arg(*vargs, int));
2834 }
2835 assert(len >= 0);
2836
Victor Stinnere215d962012-10-06 23:03:36 +02002837 if (precision < len)
2838 precision = len;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002839
2840 arglen = Py_MAX(precision, width);
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002841 if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2842 return NULL;
2843
Victor Stinnere215d962012-10-06 23:03:36 +02002844 if (width > precision) {
2845 Py_UCS4 fillchar;
2846 fill = width - precision;
2847 fillchar = zeropad?'0':' ';
Victor Stinner15a11362012-10-06 23:48:20 +02002848 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2849 return NULL;
2850 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002851 }
Victor Stinner15a11362012-10-06 23:48:20 +02002852 if (precision > len) {
Victor Stinnere215d962012-10-06 23:03:36 +02002853 fill = precision - len;
Victor Stinner15a11362012-10-06 23:48:20 +02002854 if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2855 return NULL;
2856 writer->pos += fill;
Victor Stinnere215d962012-10-06 23:03:36 +02002857 }
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002858
Victor Stinner4a587072013-11-19 12:54:53 +01002859 if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2860 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002861 break;
2862 }
2863
2864 case 'p':
2865 {
2866 char number[MAX_LONG_LONG_CHARS];
2867
2868 len = sprintf(number, "%p", va_arg(*vargs, void*));
2869 assert(len >= 0);
2870
2871 /* %p is ill-defined: ensure leading 0x. */
2872 if (number[1] == 'X')
2873 number[1] = 'x';
2874 else if (number[1] != 'x') {
2875 memmove(number + 2, number,
2876 strlen(number) + 1);
2877 number[0] = '0';
2878 number[1] = 'x';
2879 len += 2;
2880 }
2881
Victor Stinner4a587072013-11-19 12:54:53 +01002882 if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002883 return NULL;
2884 break;
2885 }
2886
2887 case 's':
2888 {
2889 /* UTF-8 */
2890 const char *s = va_arg(*vargs, const char*);
Victor Stinner998b8062018-09-12 00:23:25 +02002891 if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002892 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002893 break;
2894 }
2895
2896 case 'U':
2897 {
2898 PyObject *obj = va_arg(*vargs, PyObject *);
2899 assert(obj && _PyUnicode_CHECK(obj));
2900
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002901 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002902 return NULL;
2903 break;
2904 }
2905
2906 case 'V':
2907 {
2908 PyObject *obj = va_arg(*vargs, PyObject *);
2909 const char *str = va_arg(*vargs, const char *);
Victor Stinnere215d962012-10-06 23:03:36 +02002910 if (obj) {
2911 assert(_PyUnicode_CHECK(obj));
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002912 if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002913 return NULL;
2914 }
2915 else {
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002916 assert(str != NULL);
Victor Stinner998b8062018-09-12 00:23:25 +02002917 if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002918 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002919 }
2920 break;
2921 }
2922
2923 case 'S':
2924 {
2925 PyObject *obj = va_arg(*vargs, PyObject *);
2926 PyObject *str;
2927 assert(obj);
2928 str = PyObject_Str(obj);
2929 if (!str)
2930 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002931 if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002932 Py_DECREF(str);
2933 return NULL;
2934 }
2935 Py_DECREF(str);
2936 break;
2937 }
2938
2939 case 'R':
2940 {
2941 PyObject *obj = va_arg(*vargs, PyObject *);
2942 PyObject *repr;
2943 assert(obj);
2944 repr = PyObject_Repr(obj);
2945 if (!repr)
2946 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002947 if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002948 Py_DECREF(repr);
2949 return NULL;
2950 }
2951 Py_DECREF(repr);
2952 break;
2953 }
2954
2955 case 'A':
2956 {
2957 PyObject *obj = va_arg(*vargs, PyObject *);
2958 PyObject *ascii;
2959 assert(obj);
2960 ascii = PyObject_ASCII(obj);
2961 if (!ascii)
2962 return NULL;
Victor Stinner8cecc8c2013-05-06 23:11:54 +02002963 if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
Victor Stinnere215d962012-10-06 23:03:36 +02002964 Py_DECREF(ascii);
2965 return NULL;
2966 }
2967 Py_DECREF(ascii);
2968 break;
2969 }
2970
2971 case '%':
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02002972 if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02002973 return NULL;
Victor Stinnere215d962012-10-06 23:03:36 +02002974 break;
2975
2976 default:
2977 /* if we stumble upon an unknown formatting code, copy the rest
2978 of the format string to the output string. (we cannot just
2979 skip the code, since there's no way to know what's in the
2980 argument list) */
2981 len = strlen(p);
Victor Stinner4a587072013-11-19 12:54:53 +01002982 if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
Victor Stinnere215d962012-10-06 23:03:36 +02002983 return NULL;
2984 f = p+len;
2985 return f;
2986 }
2987
2988 f++;
Victor Stinner96865452011-03-01 23:44:09 +00002989 return f;
2990}
2991
Walter Dörwaldd2034312007-05-18 16:29:38 +00002992PyObject *
2993PyUnicode_FromFormatV(const char *format, va_list vargs)
2994{
Victor Stinnere215d962012-10-06 23:03:36 +02002995 va_list vargs2;
2996 const char *f;
2997 _PyUnicodeWriter writer;
Walter Dörwaldd2034312007-05-18 16:29:38 +00002998
Victor Stinner8f674cc2013-04-17 23:02:17 +02002999 _PyUnicodeWriter_Init(&writer);
3000 writer.min_length = strlen(format) + 100;
3001 writer.overallocate = 1;
Victor Stinnere215d962012-10-06 23:03:36 +02003002
Benjamin Peterson0c212142016-09-20 20:39:33 -07003003 // Copy varags to be able to pass a reference to a subfunction.
3004 va_copy(vargs2, vargs);
Victor Stinnere215d962012-10-06 23:03:36 +02003005
3006 for (f = format; *f; ) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00003007 if (*f == '%') {
Victor Stinnere215d962012-10-06 23:03:36 +02003008 f = unicode_fromformat_arg(&writer, f, &vargs2);
3009 if (f == NULL)
3010 goto fail;
Victor Stinner1205f272010-09-11 00:54:47 +00003011 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003012 else {
Victor Stinnere215d962012-10-06 23:03:36 +02003013 const char *p;
3014 Py_ssize_t len;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003015
Victor Stinnere215d962012-10-06 23:03:36 +02003016 p = f;
3017 do
3018 {
3019 if ((unsigned char)*p > 127) {
3020 PyErr_Format(PyExc_ValueError,
3021 "PyUnicode_FromFormatV() expects an ASCII-encoded format "
3022 "string, got a non-ASCII byte: 0x%02x",
3023 (unsigned char)*p);
Victor Stinner1ddf53d2016-09-21 14:13:14 +02003024 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003025 }
3026 p++;
3027 }
3028 while (*p != '\0' && *p != '%');
3029 len = p - f;
3030
3031 if (*p == '\0')
3032 writer.overallocate = 0;
Victor Stinner4a587072013-11-19 12:54:53 +01003033
3034 if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
Victor Stinnere215d962012-10-06 23:03:36 +02003035 goto fail;
Victor Stinnere215d962012-10-06 23:03:36 +02003036
3037 f = p;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003038 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00003039 }
Christian Heimes2f2fee12016-09-21 11:37:27 +02003040 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003041 return _PyUnicodeWriter_Finish(&writer);
3042
3043 fail:
Christian Heimes2f2fee12016-09-21 11:37:27 +02003044 va_end(vargs2);
Victor Stinnere215d962012-10-06 23:03:36 +02003045 _PyUnicodeWriter_Dealloc(&writer);
Benjamin Peterson14339b62009-01-31 16:36:08 +00003046 return NULL;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003047}
3048
Walter Dörwaldd2034312007-05-18 16:29:38 +00003049PyObject *
3050PyUnicode_FromFormat(const char *format, ...)
3051{
Benjamin Peterson14339b62009-01-31 16:36:08 +00003052 PyObject* ret;
3053 va_list vargs;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003054
3055#ifdef HAVE_STDARG_PROTOTYPES
Benjamin Peterson14339b62009-01-31 16:36:08 +00003056 va_start(vargs, format);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003057#else
Benjamin Peterson14339b62009-01-31 16:36:08 +00003058 va_start(vargs);
Walter Dörwaldd2034312007-05-18 16:29:38 +00003059#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +00003060 ret = PyUnicode_FromFormatV(format, vargs);
3061 va_end(vargs);
3062 return ret;
Walter Dörwaldd2034312007-05-18 16:29:38 +00003063}
3064
Serhiy Storchakac46db922018-10-23 22:58:24 +03003065static Py_ssize_t
3066unicode_get_widechar_size(PyObject *unicode)
3067{
3068 Py_ssize_t res;
3069
3070 assert(unicode != NULL);
3071 assert(_PyUnicode_CHECK(unicode));
3072
3073 if (_PyUnicode_WSTR(unicode) != NULL) {
3074 return PyUnicode_WSTR_LENGTH(unicode);
3075 }
3076 assert(PyUnicode_IS_READY(unicode));
3077
3078 res = _PyUnicode_LENGTH(unicode);
3079#if SIZEOF_WCHAR_T == 2
3080 if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3081 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3082 const Py_UCS4 *end = s + res;
3083 for (; s < end; ++s) {
3084 if (*s > 0xFFFF) {
3085 ++res;
3086 }
3087 }
3088 }
3089#endif
3090 return res;
3091}
3092
3093static void
3094unicode_copy_as_widechar(PyObject *unicode, wchar_t *w, Py_ssize_t size)
3095{
3096 const wchar_t *wstr;
3097
3098 assert(unicode != NULL);
3099 assert(_PyUnicode_CHECK(unicode));
3100
3101 wstr = _PyUnicode_WSTR(unicode);
3102 if (wstr != NULL) {
3103 memcpy(w, wstr, size * sizeof(wchar_t));
3104 return;
3105 }
3106 assert(PyUnicode_IS_READY(unicode));
3107
3108 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3109 const Py_UCS1 *s = PyUnicode_1BYTE_DATA(unicode);
3110 for (; size--; ++s, ++w) {
3111 *w = *s;
3112 }
3113 }
3114 else {
3115#if SIZEOF_WCHAR_T == 4
3116 assert(PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND);
3117 const Py_UCS2 *s = PyUnicode_2BYTE_DATA(unicode);
3118 for (; size--; ++s, ++w) {
3119 *w = *s;
3120 }
3121#else
3122 assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
3123 const Py_UCS4 *s = PyUnicode_4BYTE_DATA(unicode);
3124 for (; size--; ++s, ++w) {
3125 Py_UCS4 ch = *s;
3126 if (ch > 0xFFFF) {
3127 assert(ch <= MAX_UNICODE);
3128 /* encode surrogate pair in this case */
3129 *w++ = Py_UNICODE_HIGH_SURROGATE(ch);
3130 if (!size--)
3131 break;
3132 *w = Py_UNICODE_LOW_SURROGATE(ch);
3133 }
3134 else {
3135 *w = ch;
3136 }
3137 }
3138#endif
3139 }
3140}
3141
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003142#ifdef HAVE_WCHAR_H
3143
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003144/* Convert a Unicode object to a wide character string.
Victor Stinner5593d8a2010-10-02 11:11:27 +00003145
Victor Stinnerd88d9832011-09-06 02:00:05 +02003146 - If w is NULL: return the number of wide characters (including the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003147 character) required to convert the unicode object. Ignore size argument.
3148
Victor Stinnerd88d9832011-09-06 02:00:05 +02003149 - Otherwise: return the number of wide characters (excluding the null
Victor Stinner5593d8a2010-10-02 11:11:27 +00003150 character) written into w. Write at most size wide characters (including
Victor Stinnerd88d9832011-09-06 02:00:05 +02003151 the null character). */
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003152Py_ssize_t
3153PyUnicode_AsWideChar(PyObject *unicode,
3154 wchar_t *w,
3155 Py_ssize_t size)
Victor Stinner137c34c2010-09-29 10:25:54 +00003156{
Victor Stinner5593d8a2010-10-02 11:11:27 +00003157 Py_ssize_t res;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003158
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003159 if (unicode == NULL) {
3160 PyErr_BadInternalCall();
3161 return -1;
3162 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003163 if (!PyUnicode_Check(unicode)) {
3164 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003165 return -1;
Victor Stinner5593d8a2010-10-02 11:11:27 +00003166 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003167
3168 res = unicode_get_widechar_size(unicode);
3169 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003170 return res + 1;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003171 }
3172
3173 if (size > res) {
3174 size = res + 1;
3175 }
3176 else {
3177 res = size;
3178 }
3179 unicode_copy_as_widechar(unicode, w, size);
3180 return res;
Victor Stinner137c34c2010-09-29 10:25:54 +00003181}
3182
Victor Stinner137c34c2010-09-29 10:25:54 +00003183wchar_t*
Victor Stinnerbeb4135b2010-10-07 01:02:42 +00003184PyUnicode_AsWideCharString(PyObject *unicode,
Victor Stinner137c34c2010-09-29 10:25:54 +00003185 Py_ssize_t *size)
3186{
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003187 wchar_t *buffer;
Victor Stinner137c34c2010-09-29 10:25:54 +00003188 Py_ssize_t buflen;
3189
3190 if (unicode == NULL) {
3191 PyErr_BadInternalCall();
3192 return NULL;
3193 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003194 if (!PyUnicode_Check(unicode)) {
3195 PyErr_BadArgument();
Serhiy Storchakae613e6a2017-06-27 16:03:14 +03003196 return NULL;
3197 }
3198
Serhiy Storchakac46db922018-10-23 22:58:24 +03003199 buflen = unicode_get_widechar_size(unicode);
3200 buffer = (wchar_t *) PyMem_NEW(wchar_t, (buflen + 1));
Victor Stinner137c34c2010-09-29 10:25:54 +00003201 if (buffer == NULL) {
3202 PyErr_NoMemory();
3203 return NULL;
3204 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03003205 unicode_copy_as_widechar(unicode, buffer, buflen + 1);
3206 if (size != NULL) {
Victor Stinner5593d8a2010-10-02 11:11:27 +00003207 *size = buflen;
Serhiy Storchakac46db922018-10-23 22:58:24 +03003208 }
3209 else if (wcslen(buffer) != (size_t)buflen) {
3210 PyMem_FREE(buffer);
3211 PyErr_SetString(PyExc_ValueError,
3212 "embedded null character");
3213 return NULL;
3214 }
Victor Stinner137c34c2010-09-29 10:25:54 +00003215 return buffer;
3216}
3217
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003218#endif /* HAVE_WCHAR_H */
Guido van Rossumd57fd912000-03-10 22:53:23 +00003219
Alexander Belopolsky40018472011-02-26 01:02:56 +00003220PyObject *
3221PyUnicode_FromOrdinal(int ordinal)
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003222{
Victor Stinner8faf8212011-12-08 22:14:11 +01003223 if (ordinal < 0 || ordinal > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003224 PyErr_SetString(PyExc_ValueError,
3225 "chr() arg not in range(0x110000)");
3226 return NULL;
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003227 }
Guido van Rossum8ac004e2007-07-15 13:00:05 +00003228
Victor Stinner985a82a2014-01-03 12:53:47 +01003229 return unicode_char((Py_UCS4)ordinal);
Marc-André Lemburgcc8764c2002-08-11 12:23:04 +00003230}
3231
Alexander Belopolsky40018472011-02-26 01:02:56 +00003232PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003233PyUnicode_FromObject(PyObject *obj)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003234{
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003235 /* XXX Perhaps we should make this API an alias of
Benjamin Peterson29060642009-01-31 22:14:21 +00003236 PyObject_Str() instead ?! */
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003237 if (PyUnicode_CheckExact(obj)) {
Benjamin Petersonbac79492012-01-14 13:34:47 -05003238 if (PyUnicode_READY(obj) == -1)
Victor Stinnerd3a83d52011-10-01 03:09:33 +02003239 return NULL;
Benjamin Peterson29060642009-01-31 22:14:21 +00003240 Py_INCREF(obj);
3241 return obj;
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003242 }
3243 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003244 /* For a Unicode subtype that's not a Unicode object,
3245 return a true Unicode object with the same data. */
Victor Stinnerbf6e5602011-12-12 01:53:47 +01003246 return _PyUnicode_Copy(obj);
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003247 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00003248 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003249 "Can't convert '%.100s' object to str implicitly",
3250 Py_TYPE(obj)->tp_name);
Guido van Rossum98297ee2007-11-06 21:34:58 +00003251 return NULL;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003252}
3253
Alexander Belopolsky40018472011-02-26 01:02:56 +00003254PyObject *
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02003255PyUnicode_FromEncodedObject(PyObject *obj,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003256 const char *encoding,
3257 const char *errors)
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003258{
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003259 Py_buffer buffer;
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003260 PyObject *v;
Tim Petersced69f82003-09-16 20:30:58 +00003261
Guido van Rossumd57fd912000-03-10 22:53:23 +00003262 if (obj == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003263 PyErr_BadInternalCall();
3264 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003265 }
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003266
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003267 /* Decoding bytes objects is the most common case and should be fast */
3268 if (PyBytes_Check(obj)) {
Victor Stinner22eb6892019-06-26 00:51:05 +02003269 if (PyBytes_GET_SIZE(obj) == 0) {
3270 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3271 return NULL;
3272 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003273 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner22eb6892019-06-26 00:51:05 +02003274 }
3275 return PyUnicode_Decode(
Serhiy Storchaka05997252013-01-26 12:14:02 +02003276 PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
3277 encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003278 }
3279
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003280 if (PyUnicode_Check(obj)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00003281 PyErr_SetString(PyExc_TypeError,
3282 "decoding str is not supported");
3283 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +00003284 }
Guido van Rossumb8c65bc2001-10-19 02:01:31 +00003285
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003286 /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
3287 if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
3288 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003289 "decoding to str: need a bytes-like object, %.80s found",
3290 Py_TYPE(obj)->tp_name);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003291 return NULL;
Marc-André Lemburg6871f6a2001-09-20 12:53:16 +00003292 }
Tim Petersced69f82003-09-16 20:30:58 +00003293
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003294 if (buffer.len == 0) {
Serhiy Storchaka05997252013-01-26 12:14:02 +02003295 PyBuffer_Release(&buffer);
Victor Stinner22eb6892019-06-26 00:51:05 +02003296 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3297 return NULL;
3298 }
Serhiy Storchaka05997252013-01-26 12:14:02 +02003299 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +00003300 }
Marc-André Lemburgad7c98e2001-01-17 17:09:53 +00003301
Serhiy Storchaka05997252013-01-26 12:14:02 +02003302 v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
Antoine Pitroub0fa8312010-09-01 15:10:12 +00003303 PyBuffer_Release(&buffer);
Marc-André Lemburg5a5c81a2000-07-07 13:46:42 +00003304 return v;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003305}
3306
/* Normalize an encoding name: similar to encodings.normalize_encoding(), but
   also convert to lowercase.

   ASCII alphanumeric characters and '.' are copied to `lower`, lowercased.
   Every run of other characters that sits between two copied characters is
   replaced by a single '_'; such runs at the start or at the end of the name
   are dropped.  The result is always null-terminated on success.

   `lower` is the output buffer and `lower_len` its size in bytes, including
   room for the terminating null character.

   Return 1 on success, or 0 on error (the normalized name is longer than
   lower_len-1 characters, or lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;
    int punct;

    assert(encoding != NULL);

    if (lower_len == 0) {
        /* No room even for the terminating null character.  Bail out before
           computing &lower[lower_len - 1], which would wrap around. */
        return 0;
    }

    l = lower;
    l_end = &lower[lower_len - 1];    /* slot reserved for the final '\0' */
    punct = 0;                        /* pending separator run? */
    for (e = encoding; *e != 0; e++) {
        char c = *e;

        if (!(Py_ISALNUM(c) || c == '.')) {
            /* Separator: remember it, it is emitted lazily as one '_'. */
            punct = 1;
            continue;
        }

        if (punct && l != lower) {
            if (l == l_end) {
                return 0;
            }
            *l++ = '_';
        }
        punct = 0;

        if (l == l_end) {
            return 0;
        }
        *l++ = Py_TOLOWER(c);
    }
    *l = '\0';
    return 1;
}
3355
Alexander Belopolsky40018472011-02-26 01:02:56 +00003356PyObject *
3357PyUnicode_Decode(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003358 Py_ssize_t size,
3359 const char *encoding,
3360 const char *errors)
Victor Stinner600d3be2010-06-10 12:00:55 +00003361{
3362 PyObject *buffer = NULL, *unicode;
3363 Py_buffer info;
Victor Stinner942889a2016-09-05 15:40:10 -07003364 char buflower[11]; /* strlen("iso-8859-1\0") == 11, longest shortcut */
3365
Victor Stinner22eb6892019-06-26 00:51:05 +02003366 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3367 return NULL;
3368 }
3369
Victor Stinnered076ed2019-06-26 01:49:32 +02003370 if (size == 0) {
3371 _Py_RETURN_UNICODE_EMPTY();
3372 }
3373
Victor Stinner942889a2016-09-05 15:40:10 -07003374 if (encoding == NULL) {
3375 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3376 }
Victor Stinner600d3be2010-06-10 12:00:55 +00003377
Fred Drakee4315f52000-05-09 19:53:39 +00003378 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003379 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3380 char *lower = buflower;
3381
3382 /* Fast paths */
3383 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3384 lower += 3;
3385 if (*lower == '_') {
3386 /* Match "utf8" and "utf_8" */
3387 lower++;
3388 }
3389
3390 if (lower[0] == '8' && lower[1] == 0) {
3391 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3392 }
3393 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3394 return PyUnicode_DecodeUTF16(s, size, errors, 0);
3395 }
3396 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3397 return PyUnicode_DecodeUTF32(s, size, errors, 0);
3398 }
3399 }
3400 else {
3401 if (strcmp(lower, "ascii") == 0
3402 || strcmp(lower, "us_ascii") == 0) {
3403 return PyUnicode_DecodeASCII(s, size, errors);
3404 }
Steve Dowercc16be82016-09-08 10:35:16 -07003405 #ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003406 else if (strcmp(lower, "mbcs") == 0) {
3407 return PyUnicode_DecodeMBCS(s, size, errors);
3408 }
3409 #endif
3410 else if (strcmp(lower, "latin1") == 0
3411 || strcmp(lower, "latin_1") == 0
3412 || strcmp(lower, "iso_8859_1") == 0
3413 || strcmp(lower, "iso8859_1") == 0) {
3414 return PyUnicode_DecodeLatin1(s, size, errors);
3415 }
3416 }
Victor Stinner37296e82010-06-10 13:36:23 +00003417 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003418
3419 /* Decode via the codec registry */
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003420 buffer = NULL;
Benjamin Peterson95905ce2020-02-11 19:36:14 -08003421 if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
Guido van Rossumbe801ac2007-10-08 03:32:34 +00003422 goto onError;
Antoine Pitrouee58fa42008-08-19 18:22:14 +00003423 buffer = PyMemoryView_FromBuffer(&info);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003424 if (buffer == NULL)
3425 goto onError;
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003426 unicode = _PyCodec_DecodeText(buffer, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003427 if (unicode == NULL)
3428 goto onError;
3429 if (!PyUnicode_Check(unicode)) {
3430 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003431 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003432 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003433 encoding,
3434 Py_TYPE(unicode)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003435 Py_DECREF(unicode);
3436 goto onError;
3437 }
3438 Py_DECREF(buffer);
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003439 return unicode_result(unicode);
Tim Petersced69f82003-09-16 20:30:58 +00003440
Benjamin Peterson29060642009-01-31 22:14:21 +00003441 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003442 Py_XDECREF(buffer);
3443 return NULL;
3444}
3445
Alexander Belopolsky40018472011-02-26 01:02:56 +00003446PyObject *
3447PyUnicode_AsDecodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003448 const char *encoding,
3449 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003450{
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003451 if (!PyUnicode_Check(unicode)) {
3452 PyErr_BadArgument();
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003453 return NULL;
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003454 }
3455
Serhiy Storchaka00939072016-10-27 21:05:49 +03003456 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3457 "PyUnicode_AsDecodedObject() is deprecated; "
3458 "use PyCodec_Decode() to decode from str", 1) < 0)
3459 return NULL;
3460
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003461 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003462 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003463
3464 /* Decode via the codec registry */
Serhiy Storchaka77eede32016-10-25 10:07:51 +03003465 return PyCodec_Decode(unicode, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003466}
3467
Alexander Belopolsky40018472011-02-26 01:02:56 +00003468PyObject *
3469PyUnicode_AsDecodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003470 const char *encoding,
3471 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003472{
3473 PyObject *v;
3474
3475 if (!PyUnicode_Check(unicode)) {
3476 PyErr_BadArgument();
3477 goto onError;
3478 }
3479
Serhiy Storchaka00939072016-10-27 21:05:49 +03003480 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3481 "PyUnicode_AsDecodedUnicode() is deprecated; "
3482 "use PyCodec_Decode() to decode from str to str", 1) < 0)
3483 return NULL;
3484
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003485 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003486 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003487
3488 /* Decode via the codec registry */
3489 v = PyCodec_Decode(unicode, encoding, errors);
3490 if (v == NULL)
3491 goto onError;
3492 if (!PyUnicode_Check(v)) {
3493 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003494 "'%.400s' decoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003495 "use codecs.decode() to decode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003496 encoding,
3497 Py_TYPE(unicode)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003498 Py_DECREF(v);
3499 goto onError;
3500 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01003501 return unicode_result(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003502
Benjamin Peterson29060642009-01-31 22:14:21 +00003503 onError:
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003504 return NULL;
3505}
3506
Alexander Belopolsky40018472011-02-26 01:02:56 +00003507PyObject *
3508PyUnicode_Encode(const Py_UNICODE *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003509 Py_ssize_t size,
3510 const char *encoding,
3511 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003512{
3513 PyObject *v, *unicode;
Tim Petersced69f82003-09-16 20:30:58 +00003514
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02003515 unicode = PyUnicode_FromWideChar(s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003516 if (unicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003517 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003518 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3519 Py_DECREF(unicode);
3520 return v;
3521}
3522
Alexander Belopolsky40018472011-02-26 01:02:56 +00003523PyObject *
3524PyUnicode_AsEncodedObject(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003525 const char *encoding,
3526 const char *errors)
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003527{
3528 PyObject *v;
3529
3530 if (!PyUnicode_Check(unicode)) {
3531 PyErr_BadArgument();
3532 goto onError;
3533 }
3534
Serhiy Storchaka00939072016-10-27 21:05:49 +03003535 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3536 "PyUnicode_AsEncodedObject() is deprecated; "
3537 "use PyUnicode_AsEncodedString() to encode from str to bytes "
3538 "or PyCodec_Encode() for generic encoding", 1) < 0)
3539 return NULL;
3540
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003541 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003542 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003543
3544 /* Encode via the codec registry */
3545 v = PyCodec_Encode(unicode, encoding, errors);
3546 if (v == NULL)
3547 goto onError;
3548 return v;
3549
Benjamin Peterson29060642009-01-31 22:14:21 +00003550 onError:
Marc-André Lemburgd2d45982004-07-08 17:57:32 +00003551 return NULL;
3552}
3553
Victor Stinner1b579672011-12-17 05:47:23 +01003554
Victor Stinner2cba6b82018-01-10 22:46:15 +01003555static PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04003556unicode_encode_locale(PyObject *unicode, _Py_error_handler error_handler,
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003557 int current_locale)
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003558{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003559 Py_ssize_t wlen;
3560 wchar_t *wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3561 if (wstr == NULL) {
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003562 return NULL;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003563 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003564
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003565 if ((size_t)wlen != wcslen(wstr)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003566 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003567 PyMem_Free(wstr);
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003568 return NULL;
3569 }
3570
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003571 char *str;
3572 size_t error_pos;
3573 const char *reason;
3574 int res = _Py_EncodeLocaleEx(wstr, &str, &error_pos, &reason,
Victor Stinner3d4226a2018-08-29 22:21:32 +02003575 current_locale, error_handler);
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003576 PyMem_Free(wstr);
3577
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003578 if (res != 0) {
3579 if (res == -2) {
3580 PyObject *exc;
3581 exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnns",
3582 "locale", unicode,
3583 (Py_ssize_t)error_pos,
3584 (Py_ssize_t)(error_pos+1),
3585 reason);
3586 if (exc != NULL) {
3587 PyCodec_StrictErrors(exc);
3588 Py_DECREF(exc);
3589 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003590 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003591 else if (res == -3) {
3592 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3593 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003594 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003595 PyErr_NoMemory();
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003596 }
Victor Stinnerbde9d6b2018-11-28 10:26:20 +01003597 return NULL;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003598 }
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003599
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003600 PyObject *bytes = PyBytes_FromString(str);
3601 PyMem_RawFree(str);
3602 return bytes;
Victor Stinnerf2ea71f2011-12-17 04:13:41 +01003603}
3604
Victor Stinnerad158722010-10-27 00:25:46 +00003605PyObject *
Victor Stinner2cba6b82018-01-10 22:46:15 +01003606PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3607{
Victor Stinner709d23d2019-05-02 14:56:30 -04003608 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3609 return unicode_encode_locale(unicode, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003610}
3611
3612PyObject *
Victor Stinnerad158722010-10-27 00:25:46 +00003613PyUnicode_EncodeFSDefault(PyObject *unicode)
Victor Stinnerae6265f2010-05-15 16:27:27 +00003614{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003615 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003616 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003617 return unicode_encode_utf8(unicode,
3618 interp->fs_codec.error_handler,
3619 interp->fs_codec.errors);
3620 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003621#ifndef _Py_FORCE_UTF8_FS_ENCODING
3622 else if (interp->fs_codec.encoding) {
Victor Stinnerae6265f2010-05-15 16:27:27 +00003623 return PyUnicode_AsEncodedString(unicode,
Victor Stinner709d23d2019-05-02 14:56:30 -04003624 interp->fs_codec.encoding,
3625 interp->fs_codec.errors);
Victor Stinnerc39211f2010-09-29 16:35:47 +00003626 }
Victor Stinnerad158722010-10-27 00:25:46 +00003627#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003628 else {
3629 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3630 machinery is not ready and so cannot be used:
3631 use wcstombs() in this case. */
3632 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3633 assert(filesystem_errors != NULL);
3634 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3635 assert(errors != _Py_ERROR_UNKNOWN);
3636#ifdef _Py_FORCE_UTF8_FS_ENCODING
3637 return unicode_encode_utf8(unicode, errors, NULL);
3638#else
3639 return unicode_encode_locale(unicode, errors, 0);
3640#endif
3641 }
Victor Stinnerae6265f2010-05-15 16:27:27 +00003642}
3643
Alexander Belopolsky40018472011-02-26 01:02:56 +00003644PyObject *
3645PyUnicode_AsEncodedString(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003646 const char *encoding,
3647 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00003648{
3649 PyObject *v;
Victor Stinner942889a2016-09-05 15:40:10 -07003650 char buflower[11]; /* strlen("iso_8859_1\0") == 11, longest shortcut */
Tim Petersced69f82003-09-16 20:30:58 +00003651
Guido van Rossumd57fd912000-03-10 22:53:23 +00003652 if (!PyUnicode_Check(unicode)) {
3653 PyErr_BadArgument();
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003654 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00003655 }
Fred Drakee4315f52000-05-09 19:53:39 +00003656
Victor Stinner22eb6892019-06-26 00:51:05 +02003657 if (unicode_check_encoding_errors(encoding, errors) < 0) {
3658 return NULL;
3659 }
3660
Victor Stinner942889a2016-09-05 15:40:10 -07003661 if (encoding == NULL) {
3662 return _PyUnicode_AsUTF8String(unicode, errors);
3663 }
3664
Fred Drakee4315f52000-05-09 19:53:39 +00003665 /* Shortcuts for common default encodings */
Victor Stinner942889a2016-09-05 15:40:10 -07003666 if (_Py_normalize_encoding(encoding, buflower, sizeof(buflower))) {
3667 char *lower = buflower;
3668
3669 /* Fast paths */
3670 if (lower[0] == 'u' && lower[1] == 't' && lower[2] == 'f') {
3671 lower += 3;
3672 if (*lower == '_') {
3673 /* Match "utf8" and "utf_8" */
3674 lower++;
3675 }
3676
3677 if (lower[0] == '8' && lower[1] == 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003678 return _PyUnicode_AsUTF8String(unicode, errors);
Victor Stinner942889a2016-09-05 15:40:10 -07003679 }
3680 else if (lower[0] == '1' && lower[1] == '6' && lower[2] == 0) {
3681 return _PyUnicode_EncodeUTF16(unicode, errors, 0);
3682 }
3683 else if (lower[0] == '3' && lower[1] == '2' && lower[2] == 0) {
3684 return _PyUnicode_EncodeUTF32(unicode, errors, 0);
3685 }
Victor Stinnera5c68c32011-03-02 01:03:14 +00003686 }
Victor Stinner942889a2016-09-05 15:40:10 -07003687 else {
3688 if (strcmp(lower, "ascii") == 0
3689 || strcmp(lower, "us_ascii") == 0) {
3690 return _PyUnicode_AsASCIIString(unicode, errors);
3691 }
Steve Dowercc16be82016-09-08 10:35:16 -07003692#ifdef MS_WINDOWS
Victor Stinner942889a2016-09-05 15:40:10 -07003693 else if (strcmp(lower, "mbcs") == 0) {
3694 return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3695 }
Mark Hammond0ccda1e2003-07-01 00:13:27 +00003696#endif
Victor Stinner942889a2016-09-05 15:40:10 -07003697 else if (strcmp(lower, "latin1") == 0 ||
3698 strcmp(lower, "latin_1") == 0 ||
3699 strcmp(lower, "iso_8859_1") == 0 ||
3700 strcmp(lower, "iso8859_1") == 0) {
3701 return _PyUnicode_AsLatin1String(unicode, errors);
3702 }
3703 }
Victor Stinner37296e82010-06-10 13:36:23 +00003704 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003705
3706 /* Encode via the codec registry */
Nick Coghlanc72e4e62013-11-22 22:39:36 +10003707 v = _PyCodec_EncodeText(unicode, encoding, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00003708 if (v == NULL)
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003709 return NULL;
3710
3711 /* The normal path */
3712 if (PyBytes_Check(v))
3713 return v;
3714
3715 /* If the codec returns a buffer, raise a warning and convert to bytes */
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003716 if (PyByteArray_Check(v)) {
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003717 int error;
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003718 PyObject *b;
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003719
3720 error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
Nick Coghlan8b097b42013-11-13 23:49:21 +10003721 "encoder %s returned bytearray instead of bytes; "
3722 "use codecs.encode() to encode to arbitrary types",
Victor Stinner4a2b7a12010-08-13 14:03:48 +00003723 encoding);
3724 if (error) {
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003725 Py_DECREF(v);
3726 return NULL;
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003727 }
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003728
Serhiy Storchakafff9a312017-03-21 08:53:25 +02003729 b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v),
3730 PyByteArray_GET_SIZE(v));
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003731 Py_DECREF(v);
3732 return b;
3733 }
3734
3735 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003736 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003737 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003738 encoding,
3739 Py_TYPE(v)->tp_name);
Amaury Forgeot d'Arcf0481112008-09-05 20:48:47 +00003740 Py_DECREF(v);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003741 return NULL;
3742}
3743
Alexander Belopolsky40018472011-02-26 01:02:56 +00003744PyObject *
3745PyUnicode_AsEncodedUnicode(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03003746 const char *encoding,
3747 const char *errors)
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003748{
3749 PyObject *v;
3750
3751 if (!PyUnicode_Check(unicode)) {
3752 PyErr_BadArgument();
3753 goto onError;
3754 }
3755
Serhiy Storchaka00939072016-10-27 21:05:49 +03003756 if (PyErr_WarnEx(PyExc_DeprecationWarning,
3757 "PyUnicode_AsEncodedUnicode() is deprecated; "
3758 "use PyCodec_Encode() to encode from str to str", 1) < 0)
3759 return NULL;
3760
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003761 if (encoding == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00003762 encoding = PyUnicode_GetDefaultEncoding();
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003763
3764 /* Encode via the codec registry */
3765 v = PyCodec_Encode(unicode, encoding, errors);
3766 if (v == NULL)
3767 goto onError;
3768 if (!PyUnicode_Check(v)) {
3769 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003770 "'%.400s' encoder returned '%.400s' instead of 'str'; "
Nick Coghlan8b097b42013-11-13 23:49:21 +10003771 "use codecs.encode() to encode to arbitrary types",
Victor Stinner998b8062018-09-12 00:23:25 +02003772 encoding,
3773 Py_TYPE(v)->tp_name);
Marc-André Lemburgb2750b52008-06-06 12:18:17 +00003774 Py_DECREF(v);
3775 goto onError;
3776 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00003777 return v;
Tim Petersced69f82003-09-16 20:30:58 +00003778
Benjamin Peterson29060642009-01-31 22:14:21 +00003779 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00003780 return NULL;
3781}
3782
Victor Stinner2cba6b82018-01-10 22:46:15 +01003783static PyObject*
Victor Stinner709d23d2019-05-02 14:56:30 -04003784unicode_decode_locale(const char *str, Py_ssize_t len,
3785 _Py_error_handler errors, int current_locale)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003786{
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003787 if (str[len] != '\0' || (size_t)len != strlen(str)) {
3788 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003789 return NULL;
3790 }
3791
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003792 wchar_t *wstr;
3793 size_t wlen;
3794 const char *reason;
3795 int res = _Py_DecodeLocaleEx(str, &wstr, &wlen, &reason,
Victor Stinner709d23d2019-05-02 14:56:30 -04003796 current_locale, errors);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003797 if (res != 0) {
3798 if (res == -2) {
3799 PyObject *exc;
3800 exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nns",
3801 "locale", str, len,
3802 (Py_ssize_t)wlen,
3803 (Py_ssize_t)(wlen + 1),
3804 reason);
3805 if (exc != NULL) {
3806 PyCodec_StrictErrors(exc);
3807 Py_DECREF(exc);
3808 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003809 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02003810 else if (res == -3) {
3811 PyErr_SetString(PyExc_ValueError, "unsupported error handler");
3812 }
Victor Stinner2cba6b82018-01-10 22:46:15 +01003813 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003814 PyErr_NoMemory();
Victor Stinner2cba6b82018-01-10 22:46:15 +01003815 }
Victor Stinner2f197072011-12-17 07:08:30 +01003816 return NULL;
Victor Stinner2f197072011-12-17 07:08:30 +01003817 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01003818
3819 PyObject *unicode = PyUnicode_FromWideChar(wstr, wlen);
3820 PyMem_RawFree(wstr);
3821 return unicode;
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003822}
3823
3824PyObject*
Victor Stinner2cba6b82018-01-10 22:46:15 +01003825PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3826 const char *errors)
3827{
Victor Stinner709d23d2019-05-02 14:56:30 -04003828 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3829 return unicode_decode_locale(str, len, error_handler, 1);
Victor Stinner2cba6b82018-01-10 22:46:15 +01003830}
3831
3832PyObject*
Victor Stinner1b579672011-12-17 05:47:23 +01003833PyUnicode_DecodeLocale(const char *str, const char *errors)
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003834{
3835 Py_ssize_t size = (Py_ssize_t)strlen(str);
Victor Stinner709d23d2019-05-02 14:56:30 -04003836 _Py_error_handler error_handler = _Py_GetErrorHandler(errors);
3837 return unicode_decode_locale(str, size, error_handler, 1);
Victor Stinneraf02e1c2011-12-16 23:56:01 +01003838}
3839
3840
3841PyObject*
Christian Heimes5894ba72007-11-04 11:43:14 +00003842PyUnicode_DecodeFSDefault(const char *s) {
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003843 Py_ssize_t size = (Py_ssize_t)strlen(s);
Christian Heimes5894ba72007-11-04 11:43:14 +00003844 return PyUnicode_DecodeFSDefaultAndSize(s, size);
3845}
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003846
Christian Heimes5894ba72007-11-04 11:43:14 +00003847PyObject*
3848PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3849{
Victor Stinnercaba55b2018-08-03 15:33:52 +02003850 PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003851 if (interp->fs_codec.utf8) {
Victor Stinner709d23d2019-05-02 14:56:30 -04003852 return unicode_decode_utf8(s, size,
3853 interp->fs_codec.error_handler,
3854 interp->fs_codec.errors,
3855 NULL);
3856 }
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003857#ifndef _Py_FORCE_UTF8_FS_ENCODING
3858 else if (interp->fs_codec.encoding) {
Steve Dower78057b42016-11-06 19:35:08 -08003859 return PyUnicode_Decode(s, size,
Victor Stinner709d23d2019-05-02 14:56:30 -04003860 interp->fs_codec.encoding,
3861 interp->fs_codec.errors);
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003862 }
Victor Stinnerad158722010-10-27 00:25:46 +00003863#endif
Victor Stinnerbf305cc2020-02-05 17:39:57 +01003864 else {
3865 /* Before _PyUnicode_InitEncodings() is called, the Python codec
3866 machinery is not ready and so cannot be used:
3867 use mbstowcs() in this case. */
3868 const wchar_t *filesystem_errors = interp->config.filesystem_errors;
3869 assert(filesystem_errors != NULL);
3870 _Py_error_handler errors = get_error_handler_wide(filesystem_errors);
3871 assert(errors != _Py_ERROR_UNKNOWN);
3872#ifdef _Py_FORCE_UTF8_FS_ENCODING
3873 return unicode_decode_utf8(s, size, errors, NULL, NULL);
3874#else
3875 return unicode_decode_locale(s, size, errors, 0);
3876#endif
3877 }
Guido van Rossum00bc0e02007-10-15 02:52:41 +00003878}
3879
Martin v. Löwis011e8422009-05-05 04:43:17 +00003880
3881int
3882PyUnicode_FSConverter(PyObject* arg, void* addr)
3883{
Brett Cannonec6ce872016-09-06 15:50:29 -07003884 PyObject *path = NULL;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003885 PyObject *output = NULL;
3886 Py_ssize_t size;
3887 void *data;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003888 if (arg == NULL) {
3889 Py_DECREF(*(PyObject**)addr);
Benjamin Petersona4d33b32015-11-15 21:57:39 -08003890 *(PyObject**)addr = NULL;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003891 return 1;
3892 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003893 path = PyOS_FSPath(arg);
3894 if (path == NULL) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03003895 return 0;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003896 }
Brett Cannonec6ce872016-09-06 15:50:29 -07003897 if (PyBytes_Check(path)) {
3898 output = path;
3899 }
3900 else { // PyOS_FSPath() guarantees its returned value is bytes or str.
3901 output = PyUnicode_EncodeFSDefault(path);
3902 Py_DECREF(path);
3903 if (!output) {
3904 return 0;
3905 }
3906 assert(PyBytes_Check(output));
3907 }
3908
Victor Stinner0ea2a462010-04-30 00:22:08 +00003909 size = PyBytes_GET_SIZE(output);
3910 data = PyBytes_AS_STRING(output);
Victor Stinner12174a52014-08-15 23:17:38 +02003911 if ((size_t)size != strlen(data)) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003912 PyErr_SetString(PyExc_ValueError, "embedded null byte");
Martin v. Löwis011e8422009-05-05 04:43:17 +00003913 Py_DECREF(output);
3914 return 0;
3915 }
3916 *(PyObject**)addr = output;
Martin v. Löwisc15bdef2009-05-29 14:47:46 +00003917 return Py_CLEANUP_SUPPORTED;
Martin v. Löwis011e8422009-05-05 04:43:17 +00003918}
3919
3920
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003921int
3922PyUnicode_FSDecoder(PyObject* arg, void* addr)
3923{
Brett Cannona5711202016-09-06 19:36:01 -07003924 int is_buffer = 0;
3925 PyObject *path = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003926 PyObject *output = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003927 if (arg == NULL) {
3928 Py_DECREF(*(PyObject**)addr);
Serhiy Storchaka40db90c2017-04-20 21:19:31 +03003929 *(PyObject**)addr = NULL;
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003930 return 1;
3931 }
Brett Cannona5711202016-09-06 19:36:01 -07003932
3933 is_buffer = PyObject_CheckBuffer(arg);
3934 if (!is_buffer) {
3935 path = PyOS_FSPath(arg);
3936 if (path == NULL) {
Serhiy Storchakafebc3322016-08-06 23:29:29 +03003937 return 0;
3938 }
Brett Cannona5711202016-09-06 19:36:01 -07003939 }
3940 else {
3941 path = arg;
3942 Py_INCREF(arg);
3943 }
3944
3945 if (PyUnicode_Check(path)) {
Brett Cannona5711202016-09-06 19:36:01 -07003946 output = path;
3947 }
3948 else if (PyBytes_Check(path) || is_buffer) {
3949 PyObject *path_bytes = NULL;
3950
3951 if (!PyBytes_Check(path) &&
3952 PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
Victor Stinner998b8062018-09-12 00:23:25 +02003953 "path should be string, bytes, or os.PathLike, not %.200s",
3954 Py_TYPE(arg)->tp_name)) {
3955 Py_DECREF(path);
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003956 return 0;
Brett Cannona5711202016-09-06 19:36:01 -07003957 }
3958 path_bytes = PyBytes_FromObject(path);
3959 Py_DECREF(path);
3960 if (!path_bytes) {
3961 return 0;
3962 }
3963 output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(path_bytes),
3964 PyBytes_GET_SIZE(path_bytes));
3965 Py_DECREF(path_bytes);
3966 if (!output) {
3967 return 0;
3968 }
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003969 }
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003970 else {
3971 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +02003972 "path should be string, bytes, or os.PathLike, not %.200s",
3973 Py_TYPE(arg)->tp_name);
Brett Cannona5711202016-09-06 19:36:01 -07003974 Py_DECREF(path);
Serhiy Storchaka9305d832016-06-18 13:53:36 +03003975 return 0;
3976 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05003977 if (PyUnicode_READY(output) == -1) {
Victor Stinner065836e2011-10-27 01:56:33 +02003978 Py_DECREF(output);
3979 return 0;
3980 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003981 if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02003982 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
Serhiy Storchakad8a14472014-09-06 20:07:17 +03003983 PyErr_SetString(PyExc_ValueError, "embedded null character");
Victor Stinner47fcb5b2010-08-13 23:59:58 +00003984 Py_DECREF(output);
3985 return 0;
3986 }
3987 *(PyObject**)addr = output;
3988 return Py_CLEANUP_SUPPORTED;
3989}
3990
3991
Inada Naoki02a4d572020-02-27 13:48:59 +09003992static int unicode_fill_utf8(PyObject *unicode);
3993
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02003994const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02003995PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
Martin v. Löwis5b222132007-06-10 09:51:05 +00003996{
Neal Norwitze0a0a6e2007-08-25 01:04:21 +00003997 if (!PyUnicode_Check(unicode)) {
3998 PyErr_BadArgument();
3999 return NULL;
4000 }
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004001 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis5b222132007-06-10 09:51:05 +00004002 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004003
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004004 if (PyUnicode_UTF8(unicode) == NULL) {
Inada Naoki02a4d572020-02-27 13:48:59 +09004005 if (unicode_fill_utf8(unicode) == -1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004006 return NULL;
4007 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004008 }
4009
4010 if (psize)
Victor Stinnere90fe6a2011-10-01 16:48:13 +02004011 *psize = PyUnicode_UTF8_LENGTH(unicode);
4012 return PyUnicode_UTF8(unicode);
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004013}
4014
Serhiy Storchaka2a404b62017-01-22 23:07:07 +02004015const char *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004016PyUnicode_AsUTF8(PyObject *unicode)
Guido van Rossum7d1df6c2007-08-29 13:53:23 +00004017{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004018 return PyUnicode_AsUTF8AndSize(unicode, NULL);
4019}
4020
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004021Py_UNICODE *
4022PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
4023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004024 if (!PyUnicode_Check(unicode)) {
4025 PyErr_BadArgument();
4026 return NULL;
4027 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004028 Py_UNICODE *w = _PyUnicode_WSTR(unicode);
4029 if (w == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004030 /* Non-ASCII compact unicode object */
Serhiy Storchakac46db922018-10-23 22:58:24 +03004031 assert(_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND);
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004032 assert(PyUnicode_IS_READY(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004033
Serhiy Storchakac46db922018-10-23 22:58:24 +03004034 Py_ssize_t wlen = unicode_get_widechar_size(unicode);
4035 if ((size_t)wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
4036 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004037 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004038 }
Serhiy Storchakac46db922018-10-23 22:58:24 +03004039 w = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) * (wlen + 1));
4040 if (w == NULL) {
4041 PyErr_NoMemory();
4042 return NULL;
4043 }
4044 unicode_copy_as_widechar(unicode, w, wlen + 1);
4045 _PyUnicode_WSTR(unicode) = w;
4046 if (!PyUnicode_IS_COMPACT_ASCII(unicode)) {
4047 _PyUnicode_WSTR_LENGTH(unicode) = wlen;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004048 }
4049 }
4050 if (size != NULL)
Victor Stinner9db1a8b2011-10-23 20:04:37 +02004051 *size = PyUnicode_WSTR_LENGTH(unicode);
Serhiy Storchakac46db922018-10-23 22:58:24 +03004052 return w;
Martin v. Löwis5b222132007-06-10 09:51:05 +00004053}
4054
Alexander Belopolsky40018472011-02-26 01:02:56 +00004055Py_UNICODE *
4056PyUnicode_AsUnicode(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004057{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004058 return PyUnicode_AsUnicodeAndSize(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004059}
4060
Serhiy Storchakaf7eae0a2017-06-28 08:30:06 +03004061const Py_UNICODE *
4062_PyUnicode_AsUnicode(PyObject *unicode)
4063{
4064 Py_ssize_t size;
4065 const Py_UNICODE *wstr;
4066
4067 wstr = PyUnicode_AsUnicodeAndSize(unicode, &size);
4068 if (wstr && wcslen(wstr) != (size_t)size) {
4069 PyErr_SetString(PyExc_ValueError, "embedded null character");
4070 return NULL;
4071 }
4072 return wstr;
4073}
4074
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004075
Alexander Belopolsky40018472011-02-26 01:02:56 +00004076Py_ssize_t
4077PyUnicode_GetSize(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004078{
4079 if (!PyUnicode_Check(unicode)) {
4080 PyErr_BadArgument();
4081 goto onError;
4082 }
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004083 if (_PyUnicode_WSTR(unicode) == NULL) {
4084 if (PyUnicode_AsUnicode(unicode) == NULL)
4085 goto onError;
4086 }
4087 return PyUnicode_WSTR_LENGTH(unicode);
Guido van Rossumd57fd912000-03-10 22:53:23 +00004088
Benjamin Peterson29060642009-01-31 22:14:21 +00004089 onError:
Guido van Rossumd57fd912000-03-10 22:53:23 +00004090 return -1;
4091}
4092
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004093Py_ssize_t
4094PyUnicode_GetLength(PyObject *unicode)
4095{
Victor Stinner07621332012-06-16 04:53:46 +02004096 if (!PyUnicode_Check(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004097 PyErr_BadArgument();
4098 return -1;
4099 }
Victor Stinner07621332012-06-16 04:53:46 +02004100 if (PyUnicode_READY(unicode) == -1)
4101 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004102 return PyUnicode_GET_LENGTH(unicode);
4103}
4104
4105Py_UCS4
4106PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
4107{
Victor Stinner69ed0f42013-04-09 21:48:24 +02004108 void *data;
4109 int kind;
4110
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004111 if (!PyUnicode_Check(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004112 PyErr_BadArgument();
4113 return (Py_UCS4)-1;
4114 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +03004115 if (PyUnicode_READY(unicode) == -1) {
4116 return (Py_UCS4)-1;
4117 }
Victor Stinnerc4b49542011-12-11 22:44:26 +01004118 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinner2fe5ced2011-10-02 00:25:40 +02004119 PyErr_SetString(PyExc_IndexError, "string index out of range");
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004120 return (Py_UCS4)-1;
4121 }
Victor Stinner69ed0f42013-04-09 21:48:24 +02004122 data = PyUnicode_DATA(unicode);
4123 kind = PyUnicode_KIND(unicode);
4124 return PyUnicode_READ(kind, data, index);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004125}
4126
4127int
4128PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
4129{
4130 if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004131 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004132 return -1;
4133 }
Victor Stinner488fa492011-12-12 00:01:39 +01004134 assert(PyUnicode_IS_READY(unicode));
Victor Stinnerc4b49542011-12-11 22:44:26 +01004135 if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnercd9950f2011-10-02 00:34:53 +02004136 PyErr_SetString(PyExc_IndexError, "string index out of range");
4137 return -1;
4138 }
Victor Stinner488fa492011-12-12 00:01:39 +01004139 if (unicode_check_modifiable(unicode))
Victor Stinnercd9950f2011-10-02 00:34:53 +02004140 return -1;
Victor Stinnerc9590ad2012-03-04 01:34:37 +01004141 if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
4142 PyErr_SetString(PyExc_ValueError, "character out of range");
4143 return -1;
4144 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004145 PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
4146 index, ch);
4147 return 0;
4148}
4149
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
4155
Victor Stinner554f3f02010-06-16 23:33:54 +00004156/* create or adjust a UnicodeDecodeError */
4157static void
4158make_decode_exception(PyObject **exceptionObject,
4159 const char *encoding,
4160 const char *input, Py_ssize_t length,
4161 Py_ssize_t startpos, Py_ssize_t endpos,
4162 const char *reason)
4163{
4164 if (*exceptionObject == NULL) {
4165 *exceptionObject = PyUnicodeDecodeError_Create(
4166 encoding, input, length, startpos, endpos, reason);
4167 }
4168 else {
4169 if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4170 goto onError;
4171 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4172 goto onError;
4173 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4174 goto onError;
4175 }
4176 return;
4177
4178onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02004179 Py_CLEAR(*exceptionObject);
Victor Stinner554f3f02010-06-16 23:33:54 +00004180}
4181
Steve Dowercc16be82016-09-08 10:35:16 -07004182#ifdef MS_WINDOWS
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004183static int
4184widechar_resize(wchar_t **buf, Py_ssize_t *size, Py_ssize_t newsize)
4185{
4186 if (newsize > *size) {
4187 wchar_t *newbuf = *buf;
4188 if (PyMem_Resize(newbuf, wchar_t, newsize) == NULL) {
4189 PyErr_NoMemory();
4190 return -1;
4191 }
4192 *buf = newbuf;
4193 }
4194 *size = newsize;
4195 return 0;
4196}
4197
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004198/* error handling callback helper:
4199 build arguments, call the callback and check the arguments,
Fred Drakedb390c12005-10-28 14:39:47 +00004200 if no exception occurred, copy the replacement to the output
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004201 and adjust various state variables.
4202 return 0 on success, -1 on error
4203*/
4204
Alexander Belopolsky40018472011-02-26 01:02:56 +00004205static int
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004206unicode_decode_call_errorhandler_wchar(
4207 const char *errors, PyObject **errorHandler,
4208 const char *encoding, const char *reason,
4209 const char **input, const char **inend, Py_ssize_t *startinpos,
4210 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004211 wchar_t **buf, Py_ssize_t *bufsize, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004212{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004213 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004214
4215 PyObject *restuple = NULL;
4216 PyObject *repunicode = NULL;
Victor Stinner596a6c42011-11-09 00:02:18 +01004217 Py_ssize_t outsize;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004218 Py_ssize_t insize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004219 Py_ssize_t requiredsize;
4220 Py_ssize_t newpos;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004221 PyObject *inputobj = NULL;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004222 wchar_t *repwstr;
4223 Py_ssize_t repwlen;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004224
4225 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00004226 *errorHandler = PyCodec_LookupError(errors);
4227 if (*errorHandler == NULL)
4228 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004229 }
4230
Victor Stinner554f3f02010-06-16 23:33:54 +00004231 make_decode_exception(exceptionObject,
4232 encoding,
4233 *input, *inend - *input,
4234 *startinpos, *endinpos,
4235 reason);
4236 if (*exceptionObject == NULL)
4237 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004238
Petr Viktorinffd97532020-02-11 17:46:57 +01004239 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004240 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00004241 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004242 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004243 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00004244 goto onError;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004245 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004246 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Benjamin Peterson29060642009-01-31 22:14:21 +00004247 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004248
4249 /* Copy back the bytes variables, which might have been modified by the
4250 callback */
4251 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4252 if (!inputobj)
4253 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004254 *input = PyBytes_AS_STRING(inputobj);
4255 insize = PyBytes_GET_SIZE(inputobj);
4256 *inend = *input + insize;
4257 /* we can DECREF safely, as the exception has another reference,
4258 so the object won't go away. */
4259 Py_DECREF(inputobj);
4260
4261 if (newpos<0)
4262 newpos = insize+newpos;
4263 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004264 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004265 goto onError;
4266 }
4267
4268 repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4269 if (repwstr == NULL)
4270 goto onError;
4271 /* need more space? (at least enough for what we
4272 have+the replacement+the rest of the string (starting
4273 at the new input position), so we won't have to check space
4274 when there are no errors in the rest of the string) */
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004275 requiredsize = *outpos;
4276 if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4277 goto overflow;
4278 requiredsize += repwlen;
4279 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4280 goto overflow;
4281 requiredsize += insize - newpos;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004282 outsize = *bufsize;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004283 if (requiredsize > outsize) {
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004284 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004285 requiredsize = 2*outsize;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004286 if (widechar_resize(buf, bufsize, requiredsize) < 0) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004287 goto onError;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004288 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004289 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02004290 wcsncpy(*buf + *outpos, repwstr, repwlen);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004291 *outpos += repwlen;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004292 *endinpos = newpos;
4293 *inptr = *input + newpos;
4294
4295 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004296 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004297 return 0;
4298
Benjamin Peterson2b76ce62014-09-29 18:50:06 -04004299 overflow:
4300 PyErr_SetString(PyExc_OverflowError,
4301 "decoded result is too long for a Python string");
4302
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004303 onError:
4304 Py_XDECREF(restuple);
4305 return -1;
4306}
Steve Dowercc16be82016-09-08 10:35:16 -07004307#endif /* MS_WINDOWS */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004308
4309static int
4310unicode_decode_call_errorhandler_writer(
4311 const char *errors, PyObject **errorHandler,
4312 const char *encoding, const char *reason,
4313 const char **input, const char **inend, Py_ssize_t *startinpos,
4314 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4315 _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4316{
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004317 static const char *argparse = "Un;decoding error handler must return (str, int) tuple";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004318
4319 PyObject *restuple = NULL;
4320 PyObject *repunicode = NULL;
4321 Py_ssize_t insize;
4322 Py_ssize_t newpos;
Victor Stinner170ca6f2013-04-18 00:25:28 +02004323 Py_ssize_t replen;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004324 Py_ssize_t remain;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004325 PyObject *inputobj = NULL;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004326 int need_to_grow = 0;
4327 const char *new_inptr;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004328
4329 if (*errorHandler == NULL) {
4330 *errorHandler = PyCodec_LookupError(errors);
4331 if (*errorHandler == NULL)
4332 goto onError;
4333 }
4334
4335 make_decode_exception(exceptionObject,
4336 encoding,
4337 *input, *inend - *input,
4338 *startinpos, *endinpos,
4339 reason);
4340 if (*exceptionObject == NULL)
4341 goto onError;
4342
Petr Viktorinffd97532020-02-11 17:46:57 +01004343 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004344 if (restuple == NULL)
4345 goto onError;
4346 if (!PyTuple_Check(restuple)) {
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004347 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004348 goto onError;
4349 }
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004350 if (!PyArg_ParseTuple(restuple, argparse, &repunicode, &newpos))
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004351 goto onError;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004352
4353 /* Copy back the bytes variables, which might have been modified by the
4354 callback */
4355 inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4356 if (!inputobj)
4357 goto onError;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004358 remain = *inend - *input - *endinpos;
Christian Heimes72b710a2008-05-26 13:28:38 +00004359 *input = PyBytes_AS_STRING(inputobj);
4360 insize = PyBytes_GET_SIZE(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004361 *inend = *input + insize;
Walter Dörwald36f938f2007-08-10 10:11:43 +00004362 /* we can DECREF safely, as the exception has another reference,
4363 so the object won't go away. */
4364 Py_DECREF(inputobj);
Walter Dörwalde78178e2007-07-30 13:31:40 +00004365
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004366 if (newpos<0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004367 newpos = insize+newpos;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004368 if (newpos<0 || newpos>insize) {
Victor Stinnera33bce02014-07-04 22:47:46 +02004369 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00004370 goto onError;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00004371 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004372
Victor Stinner170ca6f2013-04-18 00:25:28 +02004373 replen = PyUnicode_GET_LENGTH(repunicode);
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004374 if (replen > 1) {
4375 writer->min_length += replen - 1;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004376 need_to_grow = 1;
4377 }
4378 new_inptr = *input + newpos;
4379 if (*inend - new_inptr > remain) {
4380 /* We don't know the decoding algorithm here so we make the worst
4381 assumption that one byte decodes to one unicode character.
4382 If unfortunately one byte could decode to more unicode characters,
4383 the decoder may write out-of-bound then. Is it possible for the
4384 algorithms using this function? */
4385 writer->min_length += *inend - new_inptr - remain;
4386 need_to_grow = 1;
4387 }
4388 if (need_to_grow) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02004389 writer->overallocate = 1;
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02004390 if (_PyUnicodeWriter_Prepare(writer, writer->min_length - writer->pos,
Serhiy Storchaka7e4b9052015-01-26 01:22:54 +02004391 PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4392 goto onError;
4393 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004394 if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
Victor Stinner376cfa12013-04-17 23:58:16 +02004395 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004396
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004397 *endinpos = newpos;
Xiang Zhang2c7fd462018-01-31 20:48:05 +08004398 *inptr = new_inptr;
Walter Dörwalde78178e2007-07-30 13:31:40 +00004399
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004400 /* we made it! */
Serhiy Storchaka523c4492016-10-22 23:18:31 +03004401 Py_DECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004402 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004403
Benjamin Peterson29060642009-01-31 22:14:21 +00004404 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004405 Py_XDECREF(restuple);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004406 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004407}
4408
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004409/* --- UTF-7 Codec -------------------------------------------------------- */
4410
/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the base-64 alphabet used inside UTF-7 shift
   sequences.  All of them may evaluate their argument more than once, so
   the argument must not have side effects. */

/* True if c is one of the 64 base-64 characters
   (letters, digits, '+' and '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z'))

/* Value (0..63) of the base-64 character c.  The result is only
   meaningful when IS_BASE64(c) is true; any other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character for the low 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found in a UTF-7 string outside of a
 * shift sequence decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which starts a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
4443
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be side-effect free (it is evaluated several times).  NUL and
 * everything at or above 128 are never encoded directly. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004489
Alexander Belopolsky40018472011-02-26 01:02:56 +00004490PyObject *
4491PyUnicode_DecodeUTF7(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004492 Py_ssize_t size,
4493 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004494{
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004495 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4496}
4497
Antoine Pitrou244651a2009-05-04 18:56:13 +00004498/* The decoder. The only state we preserve is our read position,
4499 * i.e. how many characters we have consumed. So if we end in the
4500 * middle of a shift sequence we have to back off the read position
4501 * and the output to the beginning of the sequence, otherwise we lose
4502 * all the shift state (seen bits, number of bits seen, high
4503 * surrogate). */
4504
Alexander Belopolsky40018472011-02-26 01:02:56 +00004505PyObject *
4506PyUnicode_DecodeUTF7Stateful(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004507 Py_ssize_t size,
4508 const char *errors,
4509 Py_ssize_t *consumed)
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004510{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004511 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00004512 Py_ssize_t startinpos;
4513 Py_ssize_t endinpos;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004514 const char *e;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004515 _PyUnicodeWriter writer;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004516 const char *errmsg = "";
4517 int inShift = 0;
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004518 Py_ssize_t shiftOutStart;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004519 unsigned int base64bits = 0;
4520 unsigned long base64buffer = 0;
Victor Stinner24729f32011-11-10 20:31:37 +01004521 Py_UCS4 surrogate = 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004522 PyObject *errorHandler = NULL;
4523 PyObject *exc = NULL;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004524
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004525 if (size == 0) {
4526 if (consumed)
4527 *consumed = 0;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02004528 _Py_RETURN_UNICODE_EMPTY();
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004529 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004530
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004531 /* Start off assuming it's all ASCII. Widen later as necessary. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02004532 _PyUnicodeWriter_Init(&writer);
4533 writer.min_length = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004534
4535 shiftOutStart = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004536 e = s + size;
4537
4538 while (s < e) {
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004539 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00004540 restart:
Antoine Pitrou5ffd9e92008-07-25 18:05:24 +00004541 ch = (unsigned char) *s;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004542
Antoine Pitrou244651a2009-05-04 18:56:13 +00004543 if (inShift) { /* in a base-64 section */
4544 if (IS_BASE64(ch)) { /* consume a base-64 character */
4545 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4546 base64bits += 6;
4547 s++;
4548 if (base64bits >= 16) {
4549 /* we have enough bits for a UTF-16 value */
Victor Stinner24729f32011-11-10 20:31:37 +01004550 Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
Antoine Pitrou244651a2009-05-04 18:56:13 +00004551 base64bits -= 16;
4552 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004553 assert(outCh <= 0xffff);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004554 if (surrogate) {
4555 /* expecting a second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004556 if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4557 Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004558 if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004559 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004560 surrogate = 0;
Antoine Pitrou5418ee02011-11-15 01:42:21 +01004561 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004562 }
4563 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004564 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
Antoine Pitrou78edf752011-11-15 01:44:16 +01004565 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004566 surrogate = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004567 }
4568 }
Victor Stinner551ac952011-11-29 22:58:13 +01004569 if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004570 /* first surrogate */
4571 surrogate = outCh;
4572 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004573 else {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004574 if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004575 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004576 }
4577 }
4578 }
4579 else { /* now leaving a base-64 section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004580 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004581 if (base64bits > 0) { /* left-over bits */
4582 if (base64bits >= 6) {
4583 /* We've seen at least one base-64 character */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004584 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004585 errmsg = "partial character in shift sequence";
4586 goto utf7Error;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004587 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004588 else {
4589 /* Some bits remain; they should be zero */
4590 if (base64buffer != 0) {
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004591 s++;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004592 errmsg = "non-zero padding bits in shift sequence";
4593 goto utf7Error;
4594 }
4595 }
4596 }
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004597 if (surrogate && DECODE_DIRECT(ch)) {
4598 if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4599 goto onError;
4600 }
4601 surrogate = 0;
4602 if (ch == '-') {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004603 /* '-' is absorbed; other terminating
4604 characters are preserved */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004605 s++;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004606 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004607 }
4608 }
4609 else if ( ch == '+' ) {
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004610 startinpos = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004611 s++; /* consume '+' */
4612 if (s < e && *s == '-') { /* '+-' encodes '+' */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004613 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004614 if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01004615 goto onError;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004616 }
Zackery Spytze349bf22018-08-18 22:43:38 -06004617 else if (s < e && !IS_BASE64(*s)) {
4618 s++;
4619 errmsg = "ill-formed sequence";
4620 goto utf7Error;
4621 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004622 else { /* begin base64-encoded section */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004623 inShift = 1;
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004624 surrogate = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004625 shiftOutStart = writer.pos;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004626 base64bits = 0;
Serhiy Storchaka35804e42013-10-19 20:38:19 +03004627 base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004628 }
4629 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004630 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004631 s++;
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02004632 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004633 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004634 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004635 else {
4636 startinpos = s-starts;
4637 s++;
4638 errmsg = "unexpected special character";
4639 goto utf7Error;
4640 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004641 continue;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004642utf7Error:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004643 endinpos = s-starts;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004644 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00004645 errors, &errorHandler,
4646 "utf7", errmsg,
4647 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004648 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00004649 goto onError;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004650 }
4651
Antoine Pitrou244651a2009-05-04 18:56:13 +00004652 /* end of string */
4653
4654 if (inShift && !consumed) { /* in shift sequence, no more to follow */
4655 /* if we're in an inconsistent state, that's an error */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +03004656 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004657 if (surrogate ||
4658 (base64bits >= 6) ||
4659 (base64bits > 0 && base64buffer != 0)) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004660 endinpos = size;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004661 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrou244651a2009-05-04 18:56:13 +00004662 errors, &errorHandler,
4663 "utf7", "unterminated shift sequence",
4664 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004665 &writer))
Antoine Pitrou244651a2009-05-04 18:56:13 +00004666 goto onError;
4667 if (s < e)
4668 goto restart;
4669 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004670 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004671
4672 /* return state */
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004673 if (consumed) {
Antoine Pitrou244651a2009-05-04 18:56:13 +00004674 if (inShift) {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004675 *consumed = startinpos;
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004676 if (writer.pos != shiftOutStart && writer.maxchar > 127) {
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004677 PyObject *result = PyUnicode_FromKindAndData(
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004678 writer.kind, writer.data, shiftOutStart);
4679 Py_XDECREF(errorHandler);
4680 Py_XDECREF(exc);
4681 _PyUnicodeWriter_Dealloc(&writer);
4682 return result;
Serhiy Storchaka016a3f32014-02-08 14:01:29 +02004683 }
Serhiy Storchaka6cbf1512014-02-08 14:06:33 +02004684 writer.pos = shiftOutStart; /* back off output */
Antoine Pitrou244651a2009-05-04 18:56:13 +00004685 }
4686 else {
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004687 *consumed = s-starts;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004688 }
Christian Heimes5d14c2b2007-11-20 23:38:09 +00004689 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004690
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004691 Py_XDECREF(errorHandler);
4692 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004693 return _PyUnicodeWriter_Finish(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004694
Benjamin Peterson29060642009-01-31 22:14:21 +00004695 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00004696 Py_XDECREF(errorHandler);
4697 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004698 _PyUnicodeWriter_Dealloc(&writer);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004699 return NULL;
4700}
4701
4702
Alexander Belopolsky40018472011-02-26 01:02:56 +00004703PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004704_PyUnicode_EncodeUTF7(PyObject *str,
4705 int base64SetO,
4706 int base64WhiteSpace,
4707 const char *errors)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004708{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004709 int kind;
4710 void *data;
4711 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004712 PyObject *v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004713 int inShift = 0;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004714 Py_ssize_t i;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004715 unsigned int base64bits = 0;
4716 unsigned long base64buffer = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004717 char * out;
4718 char * start;
4719
Benjamin Petersonbac79492012-01-14 13:34:47 -05004720 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004721 return NULL;
4722 kind = PyUnicode_KIND(str);
4723 data = PyUnicode_DATA(str);
4724 len = PyUnicode_GET_LENGTH(str);
4725
4726 if (len == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00004727 return PyBytes_FromStringAndSize(NULL, 0);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004728
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004729 /* It might be possible to tighten this worst case */
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004730 if (len > PY_SSIZE_T_MAX / 8)
Neal Norwitz3ce5d922008-08-24 07:08:55 +00004731 return PyErr_NoMemory();
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01004732 v = PyBytes_FromStringAndSize(NULL, len * 8);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004733 if (v == NULL)
4734 return NULL;
4735
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004736 start = out = PyBytes_AS_STRING(v);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004737 for (i = 0; i < len; ++i) {
Victor Stinner0e368262011-11-10 20:12:49 +01004738 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004739
Antoine Pitrou244651a2009-05-04 18:56:13 +00004740 if (inShift) {
4741 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4742 /* shifting out */
4743 if (base64bits) { /* output remaining bits */
4744 *out++ = TO_BASE64(base64buffer << (6-base64bits));
4745 base64buffer = 0;
4746 base64bits = 0;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004747 }
4748 inShift = 0;
Antoine Pitrou244651a2009-05-04 18:56:13 +00004749 /* Characters not in the BASE64 set implicitly unshift the sequence
4750 so no '-' is required, except if the character is itself a '-' */
4751 if (IS_BASE64(ch) || ch == '-') {
4752 *out++ = '-';
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004753 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004754 *out++ = (char) ch;
4755 }
4756 else {
4757 goto encode_char;
Tim Petersced69f82003-09-16 20:30:58 +00004758 }
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004759 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004760 else { /* not in a shift sequence */
4761 if (ch == '+') {
4762 *out++ = '+';
4763 *out++ = '-';
4764 }
4765 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4766 *out++ = (char) ch;
4767 }
4768 else {
4769 *out++ = '+';
4770 inShift = 1;
4771 goto encode_char;
4772 }
4773 }
4774 continue;
4775encode_char:
Antoine Pitrou244651a2009-05-04 18:56:13 +00004776 if (ch >= 0x10000) {
Victor Stinner8faf8212011-12-08 22:14:11 +01004777 assert(ch <= MAX_UNICODE);
Victor Stinner0d3721d2011-11-22 03:27:53 +01004778
Antoine Pitrou244651a2009-05-04 18:56:13 +00004779 /* code first surrogate */
4780 base64bits += 16;
Victor Stinner76df43d2012-10-30 01:42:39 +01004781 base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004782 while (base64bits >= 6) {
4783 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4784 base64bits -= 6;
4785 }
4786 /* prepare second surrogate */
Victor Stinner551ac952011-11-29 22:58:13 +01004787 ch = Py_UNICODE_LOW_SURROGATE(ch);
Antoine Pitrou244651a2009-05-04 18:56:13 +00004788 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004789 base64bits += 16;
4790 base64buffer = (base64buffer << 16) | ch;
4791 while (base64bits >= 6) {
4792 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4793 base64bits -= 6;
4794 }
Hye-Shik Chang1bc09b72004-01-03 19:35:43 +00004795 }
Antoine Pitrou244651a2009-05-04 18:56:13 +00004796 if (base64bits)
4797 *out++= TO_BASE64(base64buffer << (6-base64bits) );
4798 if (inShift)
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004799 *out++ = '-';
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00004800 if (_PyBytes_Resize(&v, out - start) < 0)
4801 return NULL;
4802 return v;
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004803}
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004804PyObject *
4805PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4806 Py_ssize_t size,
4807 int base64SetO,
4808 int base64WhiteSpace,
4809 const char *errors)
4810{
4811 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02004812 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004813 if (tmp == NULL)
4814 return NULL;
Victor Stinner0e368262011-11-10 20:12:49 +01004815 result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
Martin v. Löwis1db7c132011-11-10 18:24:32 +01004816 base64WhiteSpace, errors);
4817 Py_DECREF(tmp);
4818 return result;
4819}
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004820
Antoine Pitrou244651a2009-05-04 18:56:13 +00004821#undef IS_BASE64
4822#undef FROM_BASE64
4823#undef TO_BASE64
4824#undef DECODE_DIRECT
4825#undef ENCODE_DIRECT
Marc-André Lemburgc60e6f72001-09-20 10:35:46 +00004826
Guido van Rossumd57fd912000-03-10 22:53:23 +00004827/* --- UTF-8 Codec -------------------------------------------------------- */
4828
Alexander Belopolsky40018472011-02-26 01:02:56 +00004829PyObject *
4830PyUnicode_DecodeUTF8(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03004831 Py_ssize_t size,
4832 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00004833{
Walter Dörwald69652032004-09-07 20:24:22 +00004834 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4835}
4836
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004837#include "stringlib/asciilib.h"
4838#include "stringlib/codecs.h"
4839#include "stringlib/undef.h"
4840
Antoine Pitrou0a3229d2011-11-21 20:39:13 +01004841#include "stringlib/ucs1lib.h"
4842#include "stringlib/codecs.h"
4843#include "stringlib/undef.h"
4844
4845#include "stringlib/ucs2lib.h"
4846#include "stringlib/codecs.h"
4847#include "stringlib/undef.h"
4848
4849#include "stringlib/ucs4lib.h"
4850#include "stringlib/codecs.h"
4851#include "stringlib/undef.h"
4852
Antoine Pitrouab868312009-01-10 15:40:25 +00004853/* Mask to quickly check whether a C 'long' contains a
4854 non-ASCII, UTF8-encoded char. */
4855#if (SIZEOF_LONG == 8)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004856# define ASCII_CHAR_MASK 0x8080808080808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004857#elif (SIZEOF_LONG == 4)
Mark Dickinson01ac8b62012-07-07 14:08:48 +02004858# define ASCII_CHAR_MASK 0x80808080UL
Antoine Pitrouab868312009-01-10 15:40:25 +00004859#else
4860# error C 'long' size should be either 4 or 8!
4861#endif
4862
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004863static Py_ssize_t
4864ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004865{
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004866 const char *p = start;
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004867 const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02004868
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004869 /*
4870 * Issue #17237: m68k is a bit different from most architectures in
4871 * that objects do not use "natural alignment" - for example, int and
4872 * long are only aligned at 2-byte boundaries. Therefore the assert()
4873 * won't work; also, tests have shown that skipping the "optimised
4874 * version" will even speed up m68k.
4875 */
4876#if !defined(__m68k__)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004877#if SIZEOF_LONG <= SIZEOF_VOID_P
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004878 assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4879 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004880 /* Fast path, see in STRINGLIB(utf8_decode) for
4881 an explanation. */
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004882 /* Help allocation */
4883 const char *_p = p;
4884 Py_UCS1 * q = dest;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004885 while (_p < aligned_end) {
4886 unsigned long value = *(const unsigned long *) _p;
4887 if (value & ASCII_CHAR_MASK)
Benjamin Peterson29060642009-01-31 22:14:21 +00004888 break;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004889 *((unsigned long *)q) = value;
4890 _p += SIZEOF_LONG;
4891 q += SIZEOF_LONG;
Benjamin Peterson14339b62009-01-31 16:36:08 +00004892 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004893 p = _p;
4894 while (p < end) {
4895 if ((unsigned char)*p & 0x80)
4896 break;
4897 *q++ = *p++;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004898 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004899 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004900 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004901#endif
Antoine Pitrou8b0e9842013-05-11 15:58:34 +02004902#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004903 while (p < end) {
4904 /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4905 for an explanation. */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02004906 if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02004907 /* Help allocation */
4908 const char *_p = p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004909 while (_p < aligned_end) {
Andy Lestere6be9b52020-02-11 20:28:35 -06004910 unsigned long value = *(const unsigned long *) _p;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004911 if (value & ASCII_CHAR_MASK)
4912 break;
4913 _p += SIZEOF_LONG;
4914 }
4915 p = _p;
4916 if (_p == end)
4917 break;
4918 }
4919 if ((unsigned char)*p & 0x80)
4920 break;
4921 ++p;
4922 }
4923 memcpy(dest, start, p - start);
4924 return p - start;
Guido van Rossumd57fd912000-03-10 22:53:23 +00004925}
Antoine Pitrouab868312009-01-10 15:40:25 +00004926
Victor Stinner709d23d2019-05-02 14:56:30 -04004927static PyObject *
4928unicode_decode_utf8(const char *s, Py_ssize_t size,
4929 _Py_error_handler error_handler, const char *errors,
4930 Py_ssize_t *consumed)
Victor Stinner785938e2011-12-11 20:09:03 +01004931{
Victor Stinner785938e2011-12-11 20:09:03 +01004932 if (size == 0) {
4933 if (consumed)
4934 *consumed = 0;
Serhiy Storchaka678db842013-01-26 12:16:36 +02004935 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner785938e2011-12-11 20:09:03 +01004936 }
4937
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004938 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4939 if (size == 1 && (unsigned char)s[0] < 128) {
Victor Stinner785938e2011-12-11 20:09:03 +01004940 if (consumed)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004941 *consumed = 1;
4942 return get_latin1_char((unsigned char)s[0]);
Victor Stinner785938e2011-12-11 20:09:03 +01004943 }
4944
Inada Naoki770847a2019-06-24 12:30:24 +09004945 const char *starts = s;
4946 const char *end = s + size;
Victor Stinner785938e2011-12-11 20:09:03 +01004947
Inada Naoki770847a2019-06-24 12:30:24 +09004948 // fast path: try ASCII string.
4949 PyObject *u = PyUnicode_New(size, 127);
4950 if (u == NULL) {
4951 return NULL;
4952 }
4953 s += ascii_decode(s, end, PyUnicode_DATA(u));
4954 if (s == end) {
4955 return u;
4956 }
4957
4958 // Use _PyUnicodeWriter after fast path is failed.
4959 _PyUnicodeWriter writer;
4960 _PyUnicodeWriter_InitWithBuffer(&writer, u);
4961 writer.pos = s - starts;
4962
4963 Py_ssize_t startinpos, endinpos;
4964 const char *errmsg = "";
4965 PyObject *error_handler_obj = NULL;
4966 PyObject *exc = NULL;
4967
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004968 while (s < end) {
4969 Py_UCS4 ch;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004970 int kind = writer.kind;
Victor Stinner1d65d912015-10-05 13:43:50 +02004971
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004972 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004973 if (PyUnicode_IS_ASCII(writer.buffer))
4974 ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004975 else
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004976 ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004977 } else if (kind == PyUnicode_2BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004978 ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004979 } else {
4980 assert(kind == PyUnicode_4BYTE_KIND);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01004981 ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004982 }
4983
4984 switch (ch) {
4985 case 0:
4986 if (s == end || consumed)
4987 goto End;
4988 errmsg = "unexpected end of data";
4989 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02004990 endinpos = end - starts;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02004991 break;
4992 case 1:
4993 errmsg = "invalid start byte";
4994 startinpos = s - starts;
4995 endinpos = startinpos + 1;
4996 break;
4997 case 2:
Serhiy Storchaka894263b2019-06-25 11:54:18 +03004998 if (consumed && (unsigned char)s[0] == 0xED && end - s == 2
4999 && (unsigned char)s[1] >= 0xA0 && (unsigned char)s[1] <= 0xBF)
5000 {
5001 /* Truncated surrogate code in range D800-DFFF */
Serhiy Storchaka7a465cb2019-03-30 08:23:38 +02005002 goto End;
5003 }
Serhiy Storchaka894263b2019-06-25 11:54:18 +03005004 /* fall through */
5005 case 3:
5006 case 4:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005007 errmsg = "invalid continuation byte";
5008 startinpos = s - starts;
Ezio Melottif7ed5d12012-11-04 23:21:38 +02005009 endinpos = startinpos + ch - 1;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005010 break;
5011 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005012 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005013 goto onError;
5014 continue;
5015 }
5016
Victor Stinner1d65d912015-10-05 13:43:50 +02005017 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02005018 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner1d65d912015-10-05 13:43:50 +02005019
5020 switch (error_handler) {
5021 case _Py_ERROR_IGNORE:
5022 s += (endinpos - startinpos);
5023 break;
5024
5025 case _Py_ERROR_REPLACE:
5026 if (_PyUnicodeWriter_WriteCharInline(&writer, 0xfffd) < 0)
5027 goto onError;
5028 s += (endinpos - startinpos);
5029 break;
5030
5031 case _Py_ERROR_SURROGATEESCAPE:
Victor Stinner74e8fac2015-10-05 13:49:26 +02005032 {
5033 Py_ssize_t i;
5034
Victor Stinner1d65d912015-10-05 13:43:50 +02005035 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
5036 goto onError;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005037 for (i=startinpos; i<endinpos; i++) {
Victor Stinner1d65d912015-10-05 13:43:50 +02005038 ch = (Py_UCS4)(unsigned char)(starts[i]);
5039 PyUnicode_WRITE(writer.kind, writer.data, writer.pos,
5040 ch + 0xdc00);
5041 writer.pos++;
5042 }
5043 s += (endinpos - startinpos);
5044 break;
Victor Stinner74e8fac2015-10-05 13:49:26 +02005045 }
Victor Stinner1d65d912015-10-05 13:43:50 +02005046
5047 default:
5048 if (unicode_decode_call_errorhandler_writer(
5049 errors, &error_handler_obj,
5050 "utf-8", errmsg,
5051 &starts, &end, &startinpos, &endinpos, &exc, &s,
5052 &writer))
5053 goto onError;
5054 }
Victor Stinner785938e2011-12-11 20:09:03 +01005055 }
5056
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005057End:
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005058 if (consumed)
5059 *consumed = s - starts;
5060
Victor Stinner1d65d912015-10-05 13:43:50 +02005061 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005062 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005063 return _PyUnicodeWriter_Finish(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005064
5065onError:
Victor Stinner1d65d912015-10-05 13:43:50 +02005066 Py_XDECREF(error_handler_obj);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005067 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005068 _PyUnicodeWriter_Dealloc(&writer);
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005069 return NULL;
Victor Stinner785938e2011-12-11 20:09:03 +01005070}
5071
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005072
Victor Stinner709d23d2019-05-02 14:56:30 -04005073PyObject *
5074PyUnicode_DecodeUTF8Stateful(const char *s,
5075 Py_ssize_t size,
5076 const char *errors,
5077 Py_ssize_t *consumed)
5078{
5079 return unicode_decode_utf8(s, size, _Py_ERROR_UNKNOWN, errors, consumed);
5080}
5081
5082
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005083/* UTF-8 decoder: use surrogateescape error handler if 'surrogateescape' is
5084 non-zero, use strict error handler otherwise.
Victor Stinner0d92c4f2012-11-12 23:32:21 +01005085
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005086 On success, write a pointer to a newly allocated wide character string into
5087 *wstr (use PyMem_RawFree() to free the memory) and write the output length
5088 (in number of wchar_t units) into *wlen (if wlen is set).
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005089
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005090 On memory allocation failure, return -1.
5091
5092 On decoding error (if surrogateescape is zero), return -2. If wlen is
5093 non-NULL, write the start of the illegal byte sequence into *wlen. If reason
5094 is not NULL, write the decoding error message into *reason. */
5095int
5096_Py_DecodeUTF8Ex(const char *s, Py_ssize_t size, wchar_t **wstr, size_t *wlen,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005097 const char **reason, _Py_error_handler errors)
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005098{
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005099 const char *orig_s = s;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005100 const char *e;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005101 wchar_t *unicode;
5102 Py_ssize_t outpos;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005103
Victor Stinner3d4226a2018-08-29 22:21:32 +02005104 int surrogateescape = 0;
5105 int surrogatepass = 0;
5106 switch (errors)
5107 {
5108 case _Py_ERROR_STRICT:
5109 break;
5110 case _Py_ERROR_SURROGATEESCAPE:
5111 surrogateescape = 1;
5112 break;
5113 case _Py_ERROR_SURROGATEPASS:
5114 surrogatepass = 1;
5115 break;
5116 default:
5117 return -3;
5118 }
5119
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005120 /* Note: size will always be longer than the resulting Unicode
5121 character count */
Victor Stinner91106cd2017-12-13 12:29:09 +01005122 if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1)) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005123 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005124 }
5125
Victor Stinner6f8eeee2013-07-07 22:57:45 +02005126 unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
Victor Stinner91106cd2017-12-13 12:29:09 +01005127 if (!unicode) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005128 return -1;
Victor Stinner91106cd2017-12-13 12:29:09 +01005129 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005130
5131 /* Unpack UTF-8 encoded data */
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005132 e = s + size;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005133 outpos = 0;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005134 while (s < e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005135 Py_UCS4 ch;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005136#if SIZEOF_WCHAR_T == 4
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005137 ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005138#else
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005139 ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005140#endif
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005141 if (ch > 0xFF) {
5142#if SIZEOF_WCHAR_T == 4
Barry Warsawb2e57942017-09-14 18:13:16 -07005143 Py_UNREACHABLE();
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005144#else
Serhiy Storchakab6266432016-11-12 14:28:06 +02005145 assert(ch > 0xFFFF && ch <= MAX_UNICODE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005146 /* write a surrogate pair */
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005147 unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
5148 unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
5149#endif
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005150 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005151 else {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005152 if (!ch && s == e) {
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005153 break;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005154 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005155
5156 if (surrogateescape) {
5157 unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
5158 }
5159 else {
5160 /* Is it a valid three-byte code? */
5161 if (surrogatepass
5162 && (e - s) >= 3
5163 && (s[0] & 0xf0) == 0xe0
5164 && (s[1] & 0xc0) == 0x80
5165 && (s[2] & 0xc0) == 0x80)
5166 {
5167 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
5168 s += 3;
5169 unicode[outpos++] = ch;
5170 }
5171 else {
5172 PyMem_RawFree(unicode );
5173 if (reason != NULL) {
5174 switch (ch) {
5175 case 0:
5176 *reason = "unexpected end of data";
5177 break;
5178 case 1:
5179 *reason = "invalid start byte";
5180 break;
5181 /* 2, 3, 4 */
5182 default:
5183 *reason = "invalid continuation byte";
5184 break;
5185 }
5186 }
5187 if (wlen != NULL) {
5188 *wlen = s - orig_s;
5189 }
5190 return -2;
5191 }
5192 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005193 }
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005194 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02005195 unicode[outpos] = L'\0';
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005196 if (wlen) {
5197 *wlen = outpos;
Victor Stinner91106cd2017-12-13 12:29:09 +01005198 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005199 *wstr = unicode;
5200 return 0;
5201}
5202
Victor Stinner5f9cf232019-03-19 01:46:25 +01005203
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005204wchar_t*
Victor Stinner5f9cf232019-03-19 01:46:25 +01005205_Py_DecodeUTF8_surrogateescape(const char *arg, Py_ssize_t arglen,
5206 size_t *wlen)
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005207{
5208 wchar_t *wstr;
Victor Stinner5f9cf232019-03-19 01:46:25 +01005209 int res = _Py_DecodeUTF8Ex(arg, arglen,
5210 &wstr, wlen,
5211 NULL, _Py_ERROR_SURROGATEESCAPE);
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005212 if (res != 0) {
Victor Stinner5f9cf232019-03-19 01:46:25 +01005213 /* _Py_DecodeUTF8Ex() must support _Py_ERROR_SURROGATEESCAPE */
5214 assert(res != -3);
5215 if (wlen) {
5216 *wlen = (size_t)res;
5217 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005218 return NULL;
5219 }
5220 return wstr;
Victor Stinnerf933e1a2010-10-20 22:58:25 +00005221}
5222
Antoine Pitrouab868312009-01-10 15:40:25 +00005223
Victor Stinnere47e6982017-12-21 15:45:16 +01005224/* UTF-8 encoder using the surrogateescape error handler .
5225
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005226 On success, return 0 and write the newly allocated character string (use
5227 PyMem_Free() to free the memory) into *str.
Victor Stinnere47e6982017-12-21 15:45:16 +01005228
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005229 On encoding failure, return -2 and write the position of the invalid
5230 surrogate character into *error_pos (if error_pos is set) and the decoding
5231 error message into *reason (if reason is set).
Victor Stinnere47e6982017-12-21 15:45:16 +01005232
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005233 On memory allocation failure, return -1. */
5234int
5235_Py_EncodeUTF8Ex(const wchar_t *text, char **str, size_t *error_pos,
Victor Stinner3d4226a2018-08-29 22:21:32 +02005236 const char **reason, int raw_malloc, _Py_error_handler errors)
Victor Stinnere47e6982017-12-21 15:45:16 +01005237{
5238 const Py_ssize_t max_char_size = 4;
5239 Py_ssize_t len = wcslen(text);
5240
5241 assert(len >= 0);
5242
Victor Stinner3d4226a2018-08-29 22:21:32 +02005243 int surrogateescape = 0;
5244 int surrogatepass = 0;
5245 switch (errors)
5246 {
5247 case _Py_ERROR_STRICT:
5248 break;
5249 case _Py_ERROR_SURROGATEESCAPE:
5250 surrogateescape = 1;
5251 break;
5252 case _Py_ERROR_SURROGATEPASS:
5253 surrogatepass = 1;
5254 break;
5255 default:
5256 return -3;
5257 }
5258
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005259 if (len > PY_SSIZE_T_MAX / max_char_size - 1) {
5260 return -1;
5261 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005262 char *bytes;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005263 if (raw_malloc) {
5264 bytes = PyMem_RawMalloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005265 }
5266 else {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005267 bytes = PyMem_Malloc((len + 1) * max_char_size);
Victor Stinnere47e6982017-12-21 15:45:16 +01005268 }
5269 if (bytes == NULL) {
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005270 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005271 }
5272
5273 char *p = bytes;
5274 Py_ssize_t i;
Victor Stinner3d4226a2018-08-29 22:21:32 +02005275 for (i = 0; i < len; ) {
5276 Py_ssize_t ch_pos = i;
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005277 Py_UCS4 ch = text[i];
Victor Stinner3d4226a2018-08-29 22:21:32 +02005278 i++;
5279#if Py_UNICODE_SIZE == 2
5280 if (Py_UNICODE_IS_HIGH_SURROGATE(ch)
5281 && i < len
5282 && Py_UNICODE_IS_LOW_SURROGATE(text[i]))
5283 {
5284 ch = Py_UNICODE_JOIN_SURROGATES(ch, text[i]);
5285 i++;
5286 }
5287#endif
Victor Stinnere47e6982017-12-21 15:45:16 +01005288
5289 if (ch < 0x80) {
5290 /* Encode ASCII */
5291 *p++ = (char) ch;
5292
5293 }
5294 else if (ch < 0x0800) {
5295 /* Encode Latin-1 */
5296 *p++ = (char)(0xc0 | (ch >> 6));
5297 *p++ = (char)(0x80 | (ch & 0x3f));
5298 }
Victor Stinner3d4226a2018-08-29 22:21:32 +02005299 else if (Py_UNICODE_IS_SURROGATE(ch) && !surrogatepass) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005300 /* surrogateescape error handler */
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005301 if (!surrogateescape || !(0xDC80 <= ch && ch <= 0xDCFF)) {
Victor Stinnere47e6982017-12-21 15:45:16 +01005302 if (error_pos != NULL) {
Victor Stinner3d4226a2018-08-29 22:21:32 +02005303 *error_pos = (size_t)ch_pos;
Victor Stinnere47e6982017-12-21 15:45:16 +01005304 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005305 if (reason != NULL) {
5306 *reason = "encoding error";
5307 }
5308 if (raw_malloc) {
5309 PyMem_RawFree(bytes);
5310 }
5311 else {
5312 PyMem_Free(bytes);
5313 }
5314 return -2;
Victor Stinnere47e6982017-12-21 15:45:16 +01005315 }
5316 *p++ = (char)(ch & 0xff);
5317 }
5318 else if (ch < 0x10000) {
5319 *p++ = (char)(0xe0 | (ch >> 12));
5320 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5321 *p++ = (char)(0x80 | (ch & 0x3f));
5322 }
5323 else { /* ch >= 0x10000 */
5324 assert(ch <= MAX_UNICODE);
5325 /* Encode UCS4 Unicode ordinals */
5326 *p++ = (char)(0xf0 | (ch >> 18));
5327 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
5328 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
5329 *p++ = (char)(0x80 | (ch & 0x3f));
5330 }
5331 }
5332 *p++ = '\0';
5333
5334 size_t final_size = (p - bytes);
Victor Stinner9dd76202017-12-21 16:20:32 +01005335 char *bytes2;
5336 if (raw_malloc) {
5337 bytes2 = PyMem_RawRealloc(bytes, final_size);
5338 }
5339 else {
5340 bytes2 = PyMem_Realloc(bytes, final_size);
5341 }
Victor Stinnere47e6982017-12-21 15:45:16 +01005342 if (bytes2 == NULL) {
5343 if (error_pos != NULL) {
5344 *error_pos = (size_t)-1;
5345 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005346 if (raw_malloc) {
5347 PyMem_RawFree(bytes);
5348 }
5349 else {
5350 PyMem_Free(bytes);
5351 }
5352 return -1;
Victor Stinnere47e6982017-12-21 15:45:16 +01005353 }
Victor Stinner7ed7aea2018-01-15 10:45:49 +01005354 *str = bytes2;
5355 return 0;
Victor Stinnere47e6982017-12-21 15:45:16 +01005356}
5357
5358
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005359/* Primary internal function which creates utf8 encoded bytes objects.
5360
5361 Allocation strategy: if the string is short, convert into a stack buffer
Tim Peters602f7402002-04-27 18:03:26 +00005362 and allocate exactly as much space needed at the end. Else allocate the
5363 maximum possible needed (4 result bytes per Unicode character), and return
5364 the excess memory at the end.
Martin v. Löwis2a7ff352002-04-21 09:59:45 +00005365*/
Victor Stinner709d23d2019-05-02 14:56:30 -04005366static PyObject *
5367unicode_encode_utf8(PyObject *unicode, _Py_error_handler error_handler,
5368 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005369{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005370 if (!PyUnicode_Check(unicode)) {
5371 PyErr_BadArgument();
5372 return NULL;
5373 }
5374
5375 if (PyUnicode_READY(unicode) == -1)
5376 return NULL;
5377
Victor Stinnere90fe6a2011-10-01 16:48:13 +02005378 if (PyUnicode_UTF8(unicode))
5379 return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
5380 PyUnicode_UTF8_LENGTH(unicode));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005381
Inada Naoki02a4d572020-02-27 13:48:59 +09005382 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5383 void *data = PyUnicode_DATA(unicode);
5384 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5385
5386 _PyBytesWriter writer;
5387 char *end;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005388
Benjamin Petersonead6b532011-12-20 17:23:42 -06005389 switch (kind) {
Victor Stinner6099a032011-12-18 14:22:26 +01005390 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07005391 Py_UNREACHABLE();
Victor Stinner6099a032011-12-18 14:22:26 +01005392 case PyUnicode_1BYTE_KIND:
5393 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5394 assert(!PyUnicode_IS_ASCII(unicode));
Inada Naoki02a4d572020-02-27 13:48:59 +09005395 end = ucs1lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5396 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005397 case PyUnicode_2BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005398 end = ucs2lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5399 break;
Victor Stinner6099a032011-12-18 14:22:26 +01005400 case PyUnicode_4BYTE_KIND:
Inada Naoki02a4d572020-02-27 13:48:59 +09005401 end = ucs4lib_utf8_encoder(&writer, unicode, data, size, error_handler, errors);
5402 break;
Tim Peters602f7402002-04-27 18:03:26 +00005403 }
Inada Naoki02a4d572020-02-27 13:48:59 +09005404
5405 if (end == NULL) {
5406 _PyBytesWriter_Dealloc(&writer);
5407 return NULL;
5408 }
5409 return _PyBytesWriter_Finish(&writer, end);
5410}
5411
5412static int
5413unicode_fill_utf8(PyObject *unicode)
5414{
5415 /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
5416 assert(!PyUnicode_IS_ASCII(unicode));
5417
5418 enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
5419 void *data = PyUnicode_DATA(unicode);
5420 Py_ssize_t size = PyUnicode_GET_LENGTH(unicode);
5421
5422 _PyBytesWriter writer;
5423 char *end;
5424
5425 switch (kind) {
5426 default:
5427 Py_UNREACHABLE();
5428 case PyUnicode_1BYTE_KIND:
5429 end = ucs1lib_utf8_encoder(&writer, unicode, data, size,
5430 _Py_ERROR_STRICT, NULL);
5431 break;
5432 case PyUnicode_2BYTE_KIND:
5433 end = ucs2lib_utf8_encoder(&writer, unicode, data, size,
5434 _Py_ERROR_STRICT, NULL);
5435 break;
5436 case PyUnicode_4BYTE_KIND:
5437 end = ucs4lib_utf8_encoder(&writer, unicode, data, size,
5438 _Py_ERROR_STRICT, NULL);
5439 break;
5440 }
5441 if (end == NULL) {
5442 _PyBytesWriter_Dealloc(&writer);
5443 return -1;
5444 }
5445
5446 char *start = writer.use_small_buffer ? writer.small_buffer :
5447 PyBytes_AS_STRING(writer.buffer);
5448 Py_ssize_t len = end - start;
5449
5450 char *cache = PyObject_MALLOC(len + 1);
5451 if (cache == NULL) {
5452 _PyBytesWriter_Dealloc(&writer);
5453 PyErr_NoMemory();
5454 return -1;
5455 }
5456 _PyUnicode_UTF8(unicode) = cache;
5457 _PyUnicode_UTF8_LENGTH(unicode) = len;
5458 memcpy(cache, start, len);
5459 cache[len] = '\0';
5460 _PyBytesWriter_Dealloc(&writer);
5461 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005462}
5463
Alexander Belopolsky40018472011-02-26 01:02:56 +00005464PyObject *
Victor Stinner709d23d2019-05-02 14:56:30 -04005465_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
5466{
5467 return unicode_encode_utf8(unicode, _Py_ERROR_UNKNOWN, errors);
5468}
5469
5470
5471PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005472PyUnicode_EncodeUTF8(const Py_UNICODE *s,
5473 Py_ssize_t size,
5474 const char *errors)
5475{
5476 PyObject *v, *unicode;
5477
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005478 unicode = PyUnicode_FromWideChar(s, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005479 if (unicode == NULL)
5480 return NULL;
5481 v = _PyUnicode_AsUTF8String(unicode, errors);
5482 Py_DECREF(unicode);
5483 return v;
5484}
5485
5486PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005487PyUnicode_AsUTF8String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005488{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02005489 return _PyUnicode_AsUTF8String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005490}
5491
Walter Dörwald41980ca2007-08-16 21:55:45 +00005492/* --- UTF-32 Codec ------------------------------------------------------- */
5493
5494PyObject *
5495PyUnicode_DecodeUTF32(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005496 Py_ssize_t size,
5497 const char *errors,
5498 int *byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005499{
5500 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
5501}
5502
5503PyObject *
5504PyUnicode_DecodeUTF32Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005505 Py_ssize_t size,
5506 const char *errors,
5507 int *byteorder,
5508 Py_ssize_t *consumed)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005509{
5510 const char *starts = s;
5511 Py_ssize_t startinpos;
5512 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005513 _PyUnicodeWriter writer;
Mark Dickinson7db923c2010-06-12 09:10:14 +00005514 const unsigned char *q, *e;
Victor Stinnere64322e2012-10-30 23:12:47 +01005515 int le, bo = 0; /* assume native ordering by default */
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005516 const char *encoding;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005517 const char *errmsg = "";
Walter Dörwald41980ca2007-08-16 21:55:45 +00005518 PyObject *errorHandler = NULL;
5519 PyObject *exc = NULL;
Victor Stinner313a1202010-06-11 23:56:51 +00005520
Andy Lestere6be9b52020-02-11 20:28:35 -06005521 q = (const unsigned char *)s;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005522 e = q + size;
5523
5524 if (byteorder)
5525 bo = *byteorder;
5526
5527 /* Check for BOM marks (U+FEFF) in the input and adjust current
5528 byte order setting accordingly. In native mode, the leading BOM
5529 mark is skipped, in all other modes, it is copied to the output
5530 stream as-is (giving a ZWNBSP character). */
Victor Stinnere64322e2012-10-30 23:12:47 +01005531 if (bo == 0 && size >= 4) {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005532 Py_UCS4 bom = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005533 if (bom == 0x0000FEFF) {
5534 bo = -1;
5535 q += 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005536 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005537 else if (bom == 0xFFFE0000) {
5538 bo = 1;
5539 q += 4;
5540 }
5541 if (byteorder)
5542 *byteorder = bo;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005543 }
5544
Victor Stinnere64322e2012-10-30 23:12:47 +01005545 if (q == e) {
5546 if (consumed)
5547 *consumed = size;
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02005548 _Py_RETURN_UNICODE_EMPTY();
Walter Dörwald41980ca2007-08-16 21:55:45 +00005549 }
5550
Victor Stinnere64322e2012-10-30 23:12:47 +01005551#ifdef WORDS_BIGENDIAN
5552 le = bo < 0;
5553#else
5554 le = bo <= 0;
5555#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005556 encoding = le ? "utf-32-le" : "utf-32-be";
Victor Stinnere64322e2012-10-30 23:12:47 +01005557
Victor Stinner8f674cc2013-04-17 23:02:17 +02005558 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005559 writer.min_length = (e - q + 3) / 4;
5560 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005561 goto onError;
Victor Stinnere64322e2012-10-30 23:12:47 +01005562
Victor Stinnere64322e2012-10-30 23:12:47 +01005563 while (1) {
5564 Py_UCS4 ch = 0;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005565 Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
Antoine Pitroucc0cfd32010-06-11 21:46:32 +00005566
Victor Stinnere64322e2012-10-30 23:12:47 +01005567 if (e - q >= 4) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005568 enum PyUnicode_Kind kind = writer.kind;
5569 void *data = writer.data;
Victor Stinnere64322e2012-10-30 23:12:47 +01005570 const unsigned char *last = e - 4;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005571 Py_ssize_t pos = writer.pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005572 if (le) {
5573 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005574 ch = ((unsigned int)q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
Victor Stinnere64322e2012-10-30 23:12:47 +01005575 if (ch > maxch)
5576 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005577 if (kind != PyUnicode_1BYTE_KIND &&
5578 Py_UNICODE_IS_SURROGATE(ch))
5579 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005580 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005581 q += 4;
5582 } while (q <= last);
5583 }
5584 else {
5585 do {
Benjamin Peterson33d2a492016-09-06 20:40:04 -07005586 ch = ((unsigned int)q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
Victor Stinnere64322e2012-10-30 23:12:47 +01005587 if (ch > maxch)
5588 break;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005589 if (kind != PyUnicode_1BYTE_KIND &&
5590 Py_UNICODE_IS_SURROGATE(ch))
5591 break;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005592 PyUnicode_WRITE(kind, data, pos++, ch);
Victor Stinnere64322e2012-10-30 23:12:47 +01005593 q += 4;
5594 } while (q <= last);
5595 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005596 writer.pos = pos;
Victor Stinnere64322e2012-10-30 23:12:47 +01005597 }
5598
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005599 if (Py_UNICODE_IS_SURROGATE(ch)) {
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005600 errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005601 startinpos = ((const char *)q) - starts;
5602 endinpos = startinpos + 4;
5603 }
5604 else if (ch <= maxch) {
Victor Stinnere64322e2012-10-30 23:12:47 +01005605 if (q == e || consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005606 break;
Victor Stinnere64322e2012-10-30 23:12:47 +01005607 /* remaining bytes at the end? (size should be divisible by 4) */
Benjamin Peterson29060642009-01-31 22:14:21 +00005608 errmsg = "truncated data";
Victor Stinnere64322e2012-10-30 23:12:47 +01005609 startinpos = ((const char *)q) - starts;
5610 endinpos = ((const char *)e) - starts;
Benjamin Peterson29060642009-01-31 22:14:21 +00005611 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005612 else {
5613 if (ch < 0x110000) {
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005614 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Victor Stinnere64322e2012-10-30 23:12:47 +01005615 goto onError;
5616 q += 4;
5617 continue;
5618 }
Serhiy Storchakad3faf432015-01-18 11:28:37 +02005619 errmsg = "code point not in range(0x110000)";
Victor Stinnere64322e2012-10-30 23:12:47 +01005620 startinpos = ((const char *)q) - starts;
5621 endinpos = startinpos + 4;
Benjamin Peterson29060642009-01-31 22:14:21 +00005622 }
Victor Stinnere64322e2012-10-30 23:12:47 +01005623
5624 /* The remaining input chars are ignored if the callback
5625 chooses to skip the input */
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005626 if (unicode_decode_call_errorhandler_writer(
Benjamin Peterson29060642009-01-31 22:14:21 +00005627 errors, &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005628 encoding, errmsg,
Benjamin Peterson29060642009-01-31 22:14:21 +00005629 &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005630 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005631 goto onError;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005632 }
5633
Walter Dörwald41980ca2007-08-16 21:55:45 +00005634 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005635 *consumed = (const char *)q-starts;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005636
Walter Dörwald41980ca2007-08-16 21:55:45 +00005637 Py_XDECREF(errorHandler);
5638 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005639 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005640
Benjamin Peterson29060642009-01-31 22:14:21 +00005641 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005642 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005643 Py_XDECREF(errorHandler);
5644 Py_XDECREF(exc);
5645 return NULL;
5646}
5647
5648PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005649_PyUnicode_EncodeUTF32(PyObject *str,
5650 const char *errors,
5651 int byteorder)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005652{
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005653 enum PyUnicode_Kind kind;
5654 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005655 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005656 PyObject *v;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005657 uint32_t *out;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005658#if PY_LITTLE_ENDIAN
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005659 int native_ordering = byteorder <= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005660#else
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005661 int native_ordering = byteorder >= 0;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005662#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005663 const char *encoding;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005664 Py_ssize_t nsize, pos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005665 PyObject *errorHandler = NULL;
5666 PyObject *exc = NULL;
5667 PyObject *rep = NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005668
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005669 if (!PyUnicode_Check(str)) {
5670 PyErr_BadArgument();
5671 return NULL;
5672 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05005673 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005674 return NULL;
5675 kind = PyUnicode_KIND(str);
5676 data = PyUnicode_DATA(str);
5677 len = PyUnicode_GET_LENGTH(str);
5678
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005679 if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
Serhiy Storchaka30793282014-01-04 22:44:01 +02005680 return PyErr_NoMemory();
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005681 nsize = len + (byteorder == 0);
Mark Dickinsonc04ddff2012-10-06 18:04:49 +01005682 v = PyBytes_FromStringAndSize(NULL, nsize * 4);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005683 if (v == NULL)
5684 return NULL;
5685
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005686 /* output buffer is 4-bytes aligned */
5687 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005688 out = (uint32_t *)PyBytes_AS_STRING(v);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005689 if (byteorder == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005690 *out++ = 0xFEFF;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005691 if (len == 0)
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005692 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005693
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005694 if (byteorder == -1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005695 encoding = "utf-32-le";
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005696 else if (byteorder == 1)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005697 encoding = "utf-32-be";
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005698 else
5699 encoding = "utf-32";
5700
5701 if (kind == PyUnicode_1BYTE_KIND) {
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005702 ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5703 goto done;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005704 }
5705
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005706 pos = 0;
5707 while (pos < len) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005708 Py_ssize_t repsize, moreunits;
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005709
5710 if (kind == PyUnicode_2BYTE_KIND) {
5711 pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5712 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005713 }
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005714 else {
5715 assert(kind == PyUnicode_4BYTE_KIND);
5716 pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5717 &out, native_ordering);
5718 }
5719 if (pos == len)
5720 break;
Guido van Rossum98297ee2007-11-06 21:34:58 +00005721
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005722 rep = unicode_encode_call_errorhandler(
5723 errors, &errorHandler,
5724 encoding, "surrogates not allowed",
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005725 str, &exc, pos, pos + 1, &pos);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005726 if (!rep)
5727 goto error;
5728
5729 if (PyBytes_Check(rep)) {
5730 repsize = PyBytes_GET_SIZE(rep);
5731 if (repsize & 3) {
5732 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005733 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005734 "surrogates not allowed");
5735 goto error;
5736 }
5737 moreunits = repsize / 4;
5738 }
5739 else {
5740 assert(PyUnicode_Check(rep));
5741 if (PyUnicode_READY(rep) < 0)
5742 goto error;
5743 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5744 if (!PyUnicode_IS_ASCII(rep)) {
5745 raise_encode_exception(&exc, encoding,
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005746 str, pos - 1, pos,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005747 "surrogates not allowed");
5748 goto error;
5749 }
5750 }
5751
5752 /* four bytes are reserved for each surrogate */
5753 if (moreunits > 1) {
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005754 Py_ssize_t outpos = out - (uint32_t*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005755 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 4) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005756 /* integer overflow */
5757 PyErr_NoMemory();
5758 goto error;
5759 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03005760 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 4 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005761 goto error;
Benjamin Peterson9b3d7702016-09-06 13:24:00 -07005762 out = (uint32_t*) PyBytes_AS_STRING(v) + outpos;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005763 }
5764
5765 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02005766 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005767 out += moreunits;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005768 } else /* rep is unicode */ {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005769 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005770 ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5771 &out, native_ordering);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005772 }
5773
5774 Py_CLEAR(rep);
5775 }
5776
5777 /* Cut back to size actually needed. This is necessary for, for example,
5778 encoding of a string containing isolated surrogates and the 'ignore'
5779 handler is used. */
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005780 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005781 if (nsize != PyBytes_GET_SIZE(v))
5782 _PyBytes_Resize(&v, nsize);
5783 Py_XDECREF(errorHandler);
5784 Py_XDECREF(exc);
Serhiy Storchaka0d4df752015-05-12 23:12:45 +03005785 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005786 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005787 error:
5788 Py_XDECREF(rep);
5789 Py_XDECREF(errorHandler);
5790 Py_XDECREF(exc);
5791 Py_XDECREF(v);
5792 return NULL;
Walter Dörwald41980ca2007-08-16 21:55:45 +00005793}
5794
Alexander Belopolsky40018472011-02-26 01:02:56 +00005795PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005796PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5797 Py_ssize_t size,
5798 const char *errors,
5799 int byteorder)
5800{
5801 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02005802 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005803 if (tmp == NULL)
5804 return NULL;
5805 result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5806 Py_DECREF(tmp);
5807 return result;
5808}
5809
5810PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00005811PyUnicode_AsUTF32String(PyObject *unicode)
Walter Dörwald41980ca2007-08-16 21:55:45 +00005812{
Victor Stinnerb960b342011-11-20 19:12:52 +01005813 return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
Walter Dörwald41980ca2007-08-16 21:55:45 +00005814}
5815
Guido van Rossumd57fd912000-03-10 22:53:23 +00005816/* --- UTF-16 Codec ------------------------------------------------------- */
5817
Tim Peters772747b2001-08-09 22:21:55 +00005818PyObject *
5819PyUnicode_DecodeUTF16(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005820 Py_ssize_t size,
5821 const char *errors,
5822 int *byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005823{
Walter Dörwald69652032004-09-07 20:24:22 +00005824 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5825}
5826
5827PyObject *
5828PyUnicode_DecodeUTF16Stateful(const char *s,
Benjamin Peterson29060642009-01-31 22:14:21 +00005829 Py_ssize_t size,
5830 const char *errors,
5831 int *byteorder,
5832 Py_ssize_t *consumed)
Walter Dörwald69652032004-09-07 20:24:22 +00005833{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005834 const char *starts = s;
Martin v. Löwis18e16552006-02-15 17:27:45 +00005835 Py_ssize_t startinpos;
5836 Py_ssize_t endinpos;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005837 _PyUnicodeWriter writer;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005838 const unsigned char *q, *e;
Tim Peters772747b2001-08-09 22:21:55 +00005839 int bo = 0; /* assume native ordering by default */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005840 int native_ordering;
Marc-André Lemburg9542f482000-07-17 18:23:13 +00005841 const char *errmsg = "";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005842 PyObject *errorHandler = NULL;
5843 PyObject *exc = NULL;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005844 const char *encoding;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005845
Andy Lestere6be9b52020-02-11 20:28:35 -06005846 q = (const unsigned char *)s;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005847 e = q + size;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005848
5849 if (byteorder)
Tim Peters772747b2001-08-09 22:21:55 +00005850 bo = *byteorder;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005851
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005852 /* Check for BOM marks (U+FEFF) in the input and adjust current
5853 byte order setting accordingly. In native mode, the leading BOM
5854 mark is skipped, in all other modes, it is copied to the output
5855 stream as-is (giving a ZWNBSP character). */
Antoine Pitrou63065d72012-05-15 23:48:04 +02005856 if (bo == 0 && size >= 2) {
5857 const Py_UCS4 bom = (q[1] << 8) | q[0];
5858 if (bom == 0xFEFF) {
5859 q += 2;
5860 bo = -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00005861 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005862 else if (bom == 0xFFFE) {
5863 q += 2;
5864 bo = 1;
5865 }
5866 if (byteorder)
5867 *byteorder = bo;
Marc-André Lemburg489b56e2001-05-21 20:30:15 +00005868 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00005869
Antoine Pitrou63065d72012-05-15 23:48:04 +02005870 if (q == e) {
5871 if (consumed)
5872 *consumed = size;
Serhiy Storchaka678db842013-01-26 12:16:36 +02005873 _Py_RETURN_UNICODE_EMPTY();
Tim Peters772747b2001-08-09 22:21:55 +00005874 }
Antoine Pitrou63065d72012-05-15 23:48:04 +02005875
Christian Heimes743e0cd2012-10-17 23:52:17 +02005876#if PY_LITTLE_ENDIAN
Antoine Pitrou63065d72012-05-15 23:48:04 +02005877 native_ordering = bo <= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005878 encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
Antoine Pitrouab868312009-01-10 15:40:25 +00005879#else
Antoine Pitrou63065d72012-05-15 23:48:04 +02005880 native_ordering = bo >= 0;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005881 encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
Antoine Pitrouab868312009-01-10 15:40:25 +00005882#endif
Tim Peters772747b2001-08-09 22:21:55 +00005883
Antoine Pitrou63065d72012-05-15 23:48:04 +02005884 /* Note: size will always be longer than the resulting Unicode
Xiang Zhang2c7fd462018-01-31 20:48:05 +08005885 character count normally. Error handler will take care of
5886 resizing when needed. */
Victor Stinner8f674cc2013-04-17 23:02:17 +02005887 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02005888 writer.min_length = (e - q + 1) / 2;
5889 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005890 goto onError;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005891
Antoine Pitrou63065d72012-05-15 23:48:04 +02005892 while (1) {
5893 Py_UCS4 ch = 0;
5894 if (e - q >= 2) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005895 int kind = writer.kind;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005896 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005897 if (PyUnicode_IS_ASCII(writer.buffer))
Antoine Pitrou63065d72012-05-15 23:48:04 +02005898 ch = asciilib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005899 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005900 native_ordering);
5901 else
5902 ch = ucs1lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005903 (Py_UCS1*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005904 native_ordering);
5905 } else if (kind == PyUnicode_2BYTE_KIND) {
5906 ch = ucs2lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005907 (Py_UCS2*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005908 native_ordering);
5909 } else {
5910 assert(kind == PyUnicode_4BYTE_KIND);
5911 ch = ucs4lib_utf16_decode(&q, e,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005912 (Py_UCS4*)writer.data, &writer.pos,
Antoine Pitrou63065d72012-05-15 23:48:04 +02005913 native_ordering);
Antoine Pitrouab868312009-01-10 15:40:25 +00005914 }
Antoine Pitrouab868312009-01-10 15:40:25 +00005915 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005916
Antoine Pitrou63065d72012-05-15 23:48:04 +02005917 switch (ch)
5918 {
5919 case 0:
5920 /* remaining byte at the end? (size should be even) */
5921 if (q == e || consumed)
5922 goto End;
5923 errmsg = "truncated data";
5924 startinpos = ((const char *)q) - starts;
5925 endinpos = ((const char *)e) - starts;
5926 break;
5927 /* The remaining input chars are ignored if the callback
5928 chooses to skip the input */
5929 case 1:
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005930 q -= 2;
5931 if (consumed)
Serhiy Storchakaae3b32a2013-01-08 23:40:52 +02005932 goto End;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005933 errmsg = "unexpected end of data";
Serhiy Storchaka48e188e2013-01-08 23:14:24 +02005934 startinpos = ((const char *)q) - starts;
Antoine Pitrou63065d72012-05-15 23:48:04 +02005935 endinpos = ((const char *)e) - starts;
5936 break;
5937 case 2:
5938 errmsg = "illegal encoding";
5939 startinpos = ((const char *)q) - 2 - starts;
5940 endinpos = startinpos + 2;
5941 break;
5942 case 3:
5943 errmsg = "illegal UTF-16 surrogate";
5944 startinpos = ((const char *)q) - 4 - starts;
5945 endinpos = startinpos + 2;
5946 break;
5947 default:
Victor Stinner8a1a6cf2013-04-14 02:35:33 +02005948 if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
Martin v. Löwise9b11c12011-11-08 17:35:34 +01005949 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00005950 continue;
5951 }
5952
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005953 if (unicode_decode_call_errorhandler_writer(
Antoine Pitrouab868312009-01-10 15:40:25 +00005954 errors,
5955 &errorHandler,
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005956 encoding, errmsg,
Antoine Pitrouab868312009-01-10 15:40:25 +00005957 &starts,
5958 (const char **)&e,
5959 &startinpos,
5960 &endinpos,
5961 &exc,
5962 (const char **)&q,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005963 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00005964 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00005965 }
5966
Antoine Pitrou63065d72012-05-15 23:48:04 +02005967End:
Walter Dörwald69652032004-09-07 20:24:22 +00005968 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00005969 *consumed = (const char *)q-starts;
Walter Dörwald69652032004-09-07 20:24:22 +00005970
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005971 Py_XDECREF(errorHandler);
5972 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005973 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005974
Benjamin Peterson29060642009-01-31 22:14:21 +00005975 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01005976 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00005977 Py_XDECREF(errorHandler);
5978 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00005979 return NULL;
5980}
5981
Tim Peters772747b2001-08-09 22:21:55 +00005982PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005983_PyUnicode_EncodeUTF16(PyObject *str,
5984 const char *errors,
5985 int byteorder)
Guido van Rossumd57fd912000-03-10 22:53:23 +00005986{
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005987 enum PyUnicode_Kind kind;
5988 const void *data;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01005989 Py_ssize_t len;
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00005990 PyObject *v;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005991 unsigned short *out;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005992 Py_ssize_t pairs;
Christian Heimes743e0cd2012-10-17 23:52:17 +02005993#if PY_BIG_ENDIAN
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005994 int native_ordering = byteorder >= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005995#else
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02005996 int native_ordering = byteorder <= 0;
Tim Peters772747b2001-08-09 22:21:55 +00005997#endif
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02005998 const char *encoding;
5999 Py_ssize_t nsize, pos;
6000 PyObject *errorHandler = NULL;
6001 PyObject *exc = NULL;
6002 PyObject *rep = NULL;
Tim Peters772747b2001-08-09 22:21:55 +00006003
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006004 if (!PyUnicode_Check(str)) {
6005 PyErr_BadArgument();
6006 return NULL;
6007 }
Benjamin Petersonbac79492012-01-14 13:34:47 -05006008 if (PyUnicode_READY(str) == -1)
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006009 return NULL;
6010 kind = PyUnicode_KIND(str);
6011 data = PyUnicode_DATA(str);
6012 len = PyUnicode_GET_LENGTH(str);
Victor Stinner0e368262011-11-10 20:12:49 +01006013
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006014 pairs = 0;
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006015 if (kind == PyUnicode_4BYTE_KIND) {
6016 const Py_UCS4 *in = (const Py_UCS4 *)data;
6017 const Py_UCS4 *end = in + len;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006018 while (in < end) {
6019 if (*in++ >= 0x10000) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006020 pairs++;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006021 }
6022 }
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006023 }
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006024 if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006025 return PyErr_NoMemory();
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006026 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006027 nsize = len + pairs + (byteorder == 0);
6028 v = PyBytes_FromStringAndSize(NULL, nsize * 2);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006029 if (v == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006030 return NULL;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006031 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006032
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006033 /* output buffer is 2-bytes aligned */
Antoine Pitrouca8aa4a2012-09-20 20:56:47 +02006034 assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006035 out = (unsigned short *)PyBytes_AS_STRING(v);
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006036 if (byteorder == 0) {
Antoine Pitrou27f6a3b2012-06-15 22:15:23 +02006037 *out++ = 0xFEFF;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006038 }
6039 if (len == 0) {
Guido van Rossum98297ee2007-11-06 21:34:58 +00006040 goto done;
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006041 }
Tim Peters772747b2001-08-09 22:21:55 +00006042
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006043 if (kind == PyUnicode_1BYTE_KIND) {
6044 ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
6045 goto done;
Martin v. Löwis0ba70cc2001-06-26 22:22:37 +00006046 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006047
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006048 if (byteorder < 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006049 encoding = "utf-16-le";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006050 }
6051 else if (byteorder > 0) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006052 encoding = "utf-16-be";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006053 }
6054 else {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006055 encoding = "utf-16";
Victor Stinner1a05d6c2016-09-02 12:12:23 +02006056 }
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006057
6058 pos = 0;
6059 while (pos < len) {
6060 Py_ssize_t repsize, moreunits;
6061
6062 if (kind == PyUnicode_2BYTE_KIND) {
6063 pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
6064 &out, native_ordering);
6065 }
6066 else {
6067 assert(kind == PyUnicode_4BYTE_KIND);
6068 pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
6069 &out, native_ordering);
6070 }
6071 if (pos == len)
6072 break;
6073
6074 rep = unicode_encode_call_errorhandler(
6075 errors, &errorHandler,
6076 encoding, "surrogates not allowed",
6077 str, &exc, pos, pos + 1, &pos);
6078 if (!rep)
6079 goto error;
6080
6081 if (PyBytes_Check(rep)) {
6082 repsize = PyBytes_GET_SIZE(rep);
6083 if (repsize & 1) {
6084 raise_encode_exception(&exc, encoding,
6085 str, pos - 1, pos,
6086 "surrogates not allowed");
6087 goto error;
6088 }
6089 moreunits = repsize / 2;
6090 }
6091 else {
6092 assert(PyUnicode_Check(rep));
6093 if (PyUnicode_READY(rep) < 0)
6094 goto error;
6095 moreunits = repsize = PyUnicode_GET_LENGTH(rep);
6096 if (!PyUnicode_IS_ASCII(rep)) {
6097 raise_encode_exception(&exc, encoding,
6098 str, pos - 1, pos,
6099 "surrogates not allowed");
6100 goto error;
6101 }
6102 }
6103
6104 /* two bytes are reserved for each surrogate */
6105 if (moreunits > 1) {
6106 Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006107 if (moreunits >= (PY_SSIZE_T_MAX - PyBytes_GET_SIZE(v)) / 2) {
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006108 /* integer overflow */
6109 PyErr_NoMemory();
6110 goto error;
6111 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03006112 if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + 2 * (moreunits - 1)) < 0)
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006113 goto error;
6114 out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
6115 }
6116
6117 if (PyBytes_Check(rep)) {
Christian Heimesf051e432016-09-13 20:22:02 +02006118 memcpy(out, PyBytes_AS_STRING(rep), repsize);
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006119 out += moreunits;
6120 } else /* rep is unicode */ {
6121 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
6122 ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
6123 &out, native_ordering);
6124 }
6125
6126 Py_CLEAR(rep);
6127 }
6128
6129 /* Cut back to size actually needed. This is necessary for, for example,
6130 encoding of a string containing isolated surrogates and the 'ignore' handler
6131 is used. */
6132 nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
6133 if (nsize != PyBytes_GET_SIZE(v))
6134 _PyBytes_Resize(&v, nsize);
6135 Py_XDECREF(errorHandler);
6136 Py_XDECREF(exc);
Guido van Rossum98297ee2007-11-06 21:34:58 +00006137 done:
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00006138 return v;
Serhiy Storchaka58cf6072013-11-19 11:32:41 +02006139 error:
6140 Py_XDECREF(rep);
6141 Py_XDECREF(errorHandler);
6142 Py_XDECREF(exc);
6143 Py_XDECREF(v);
6144 return NULL;
6145#undef STORECHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006146}
6147
Alexander Belopolsky40018472011-02-26 01:02:56 +00006148PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006149PyUnicode_EncodeUTF16(const Py_UNICODE *s,
6150 Py_ssize_t size,
6151 const char *errors,
6152 int byteorder)
6153{
6154 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006155 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006156 if (tmp == NULL)
6157 return NULL;
6158 result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
6159 Py_DECREF(tmp);
6160 return result;
6161}
6162
6163PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00006164PyUnicode_AsUTF16String(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006165{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006166 return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006167}
6168
6169/* --- Unicode Escape Codec ----------------------------------------------- */
6170
/* File-local pointer to a _PyUnicode_Name_CAPI table; NULL until something
   assigns it.
   NOTE(review): presumably filled in on demand by the named-character
   escape handling further down in this file -- that code is outside the
   reviewed excerpt, confirm before relying on it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
Marc-André Lemburg0f774e32000-06-28 16:43:35 +00006172
Alexander Belopolsky40018472011-02-26 01:02:56 +00006173PyObject *
Eric V. Smith42454af2016-10-31 09:22:08 -04006174_PyUnicode_DecodeUnicodeEscape(const char *s,
6175 Py_ssize_t size,
6176 const char *errors,
6177 const char **first_invalid_escape)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006178{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006179 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006180 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006181 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006182 PyObject *errorHandler = NULL;
6183 PyObject *exc = NULL;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006184
Eric V. Smith42454af2016-10-31 09:22:08 -04006185 // so we can remember if we've seen an invalid escape char or not
6186 *first_invalid_escape = NULL;
6187
Victor Stinner62ec3312016-09-06 17:04:34 -07006188 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006189 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006190 }
6191 /* Escaped strings will always be longer than the resulting
6192 Unicode string, so we start with size here and then reduce the
6193 length after conversion to the true value.
6194 (but if the error callback returns a long replacement string
6195 we'll have to allocate more space) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006196 _PyUnicodeWriter_Init(&writer);
Victor Stinner62ec3312016-09-06 17:04:34 -07006197 writer.min_length = size;
6198 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6199 goto onError;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006200 }
6201
Guido van Rossumd57fd912000-03-10 22:53:23 +00006202 end = s + size;
6203 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006204 unsigned char c = (unsigned char) *s++;
6205 Py_UCS4 ch;
6206 int count;
6207 Py_ssize_t startinpos;
6208 Py_ssize_t endinpos;
6209 const char *message;
6210
6211#define WRITE_ASCII_CHAR(ch) \
6212 do { \
6213 assert(ch <= 127); \
6214 assert(writer.pos < writer.size); \
6215 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6216 } while(0)
6217
6218#define WRITE_CHAR(ch) \
6219 do { \
6220 if (ch <= writer.maxchar) { \
6221 assert(writer.pos < writer.size); \
6222 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6223 } \
6224 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6225 goto onError; \
6226 } \
6227 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006228
6229 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006230 if (c != '\\') {
6231 WRITE_CHAR(c);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006232 continue;
6233 }
6234
Victor Stinner62ec3312016-09-06 17:04:34 -07006235 startinpos = s - starts - 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006236 /* \ - Escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006237 if (s >= end) {
6238 message = "\\ at end of string";
6239 goto error;
6240 }
6241 c = (unsigned char) *s++;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006242
Victor Stinner62ec3312016-09-06 17:04:34 -07006243 assert(writer.pos < writer.size);
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006244 switch (c) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006245
Benjamin Peterson29060642009-01-31 22:14:21 +00006246 /* \x escapes */
Victor Stinner62ec3312016-09-06 17:04:34 -07006247 case '\n': continue;
6248 case '\\': WRITE_ASCII_CHAR('\\'); continue;
6249 case '\'': WRITE_ASCII_CHAR('\''); continue;
6250 case '\"': WRITE_ASCII_CHAR('\"'); continue;
6251 case 'b': WRITE_ASCII_CHAR('\b'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006252 /* FF */
Victor Stinner62ec3312016-09-06 17:04:34 -07006253 case 'f': WRITE_ASCII_CHAR('\014'); continue;
6254 case 't': WRITE_ASCII_CHAR('\t'); continue;
6255 case 'n': WRITE_ASCII_CHAR('\n'); continue;
6256 case 'r': WRITE_ASCII_CHAR('\r'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006257 /* VT */
Victor Stinner62ec3312016-09-06 17:04:34 -07006258 case 'v': WRITE_ASCII_CHAR('\013'); continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006259 /* BEL, not classic C */
Victor Stinner62ec3312016-09-06 17:04:34 -07006260 case 'a': WRITE_ASCII_CHAR('\007'); continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006261
Benjamin Peterson29060642009-01-31 22:14:21 +00006262 /* \OOO (octal) escapes */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006263 case '0': case '1': case '2': case '3':
6264 case '4': case '5': case '6': case '7':
Victor Stinner62ec3312016-09-06 17:04:34 -07006265 ch = c - '0';
Guido van Rossum8ce8a782007-11-01 19:42:39 +00006266 if (s < end && '0' <= *s && *s <= '7') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006267 ch = (ch<<3) + *s++ - '0';
6268 if (s < end && '0' <= *s && *s <= '7') {
6269 ch = (ch<<3) + *s++ - '0';
6270 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006271 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006272 WRITE_CHAR(ch);
6273 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006274
Benjamin Peterson29060642009-01-31 22:14:21 +00006275 /* hex escapes */
6276 /* \xXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006277 case 'x':
Victor Stinner62ec3312016-09-06 17:04:34 -07006278 count = 2;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006279 message = "truncated \\xXX escape";
6280 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006281
Benjamin Peterson29060642009-01-31 22:14:21 +00006282 /* \uXXXX */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006283 case 'u':
Victor Stinner62ec3312016-09-06 17:04:34 -07006284 count = 4;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006285 message = "truncated \\uXXXX escape";
6286 goto hexescape;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006287
Benjamin Peterson29060642009-01-31 22:14:21 +00006288 /* \UXXXXXXXX */
Fredrik Lundhdf846752000-09-03 11:29:49 +00006289 case 'U':
Victor Stinner62ec3312016-09-06 17:04:34 -07006290 count = 8;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006291 message = "truncated \\UXXXXXXXX escape";
6292 hexescape:
Victor Stinner62ec3312016-09-06 17:04:34 -07006293 for (ch = 0; count && s < end; ++s, --count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006294 c = (unsigned char)*s;
Victor Stinner62ec3312016-09-06 17:04:34 -07006295 ch <<= 4;
6296 if (c >= '0' && c <= '9') {
6297 ch += c - '0';
6298 }
6299 else if (c >= 'a' && c <= 'f') {
6300 ch += c - ('a' - 10);
6301 }
6302 else if (c >= 'A' && c <= 'F') {
6303 ch += c - ('A' - 10);
6304 }
6305 else {
6306 break;
6307 }
Fredrik Lundhdf846752000-09-03 11:29:49 +00006308 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006309 if (count) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006310 goto error;
Victor Stinner62ec3312016-09-06 17:04:34 -07006311 }
6312
6313 /* when we get here, ch is a 32-bit unicode character */
6314 if (ch > MAX_UNICODE) {
6315 message = "illegal Unicode character";
6316 goto error;
6317 }
6318
6319 WRITE_CHAR(ch);
6320 continue;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006321
Benjamin Peterson29060642009-01-31 22:14:21 +00006322 /* \N{name} */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006323 case 'N':
Fredrik Lundhccc74732001-02-18 22:13:49 +00006324 if (ucnhash_CAPI == NULL) {
6325 /* load the unicode data module */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006326 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
6327 PyUnicodeData_CAPSULE_NAME, 1);
Victor Stinner62ec3312016-09-06 17:04:34 -07006328 if (ucnhash_CAPI == NULL) {
6329 PyErr_SetString(
6330 PyExc_UnicodeError,
6331 "\\N escapes not supported (can't load unicodedata module)"
6332 );
6333 goto onError;
6334 }
Fredrik Lundhccc74732001-02-18 22:13:49 +00006335 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006336
6337 message = "malformed \\N character escape";
Gregory P. Smith746b2d32018-11-13 13:16:54 -08006338 if (s < end && *s == '{') {
Victor Stinner62ec3312016-09-06 17:04:34 -07006339 const char *start = ++s;
6340 size_t namelen;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006341 /* look for the closing brace */
Victor Stinner62ec3312016-09-06 17:04:34 -07006342 while (s < end && *s != '}')
Fredrik Lundhccc74732001-02-18 22:13:49 +00006343 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006344 namelen = s - start;
6345 if (namelen && s < end) {
Fredrik Lundhccc74732001-02-18 22:13:49 +00006346 /* found a name. look it up in the unicode database */
Fredrik Lundhccc74732001-02-18 22:13:49 +00006347 s++;
Victor Stinner62ec3312016-09-06 17:04:34 -07006348 ch = 0xffffffff; /* in case 'getcode' messes up */
6349 if (namelen <= INT_MAX &&
6350 ucnhash_CAPI->getcode(NULL, start, (int)namelen,
6351 &ch, 0)) {
6352 assert(ch <= MAX_UNICODE);
6353 WRITE_CHAR(ch);
6354 continue;
6355 }
6356 message = "unknown Unicode character name";
Fredrik Lundhccc74732001-02-18 22:13:49 +00006357 }
6358 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006359 goto error;
Fredrik Lundhccc74732001-02-18 22:13:49 +00006360
6361 default:
Eric V. Smith42454af2016-10-31 09:22:08 -04006362 if (*first_invalid_escape == NULL) {
6363 *first_invalid_escape = s-1; /* Back up one char, since we've
6364 already incremented s. */
6365 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006366 WRITE_ASCII_CHAR('\\');
6367 WRITE_CHAR(c);
6368 continue;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006369 }
Serhiy Storchakad6793772013-01-29 10:20:44 +02006370
6371 error:
6372 endinpos = s-starts;
Victor Stinner62ec3312016-09-06 17:04:34 -07006373 writer.min_length = end - s + writer.pos;
Serhiy Storchaka8fe5a9f2013-01-29 10:37:39 +02006374 if (unicode_decode_call_errorhandler_writer(
Serhiy Storchakad6793772013-01-29 10:20:44 +02006375 errors, &errorHandler,
6376 "unicodeescape", message,
6377 &starts, &end, &startinpos, &endinpos, &exc, &s,
Victor Stinner62ec3312016-09-06 17:04:34 -07006378 &writer)) {
Serhiy Storchakad6793772013-01-29 10:20:44 +02006379 goto onError;
Victor Stinner62ec3312016-09-06 17:04:34 -07006380 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006381 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006382
6383#undef WRITE_ASCII_CHAR
6384#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006385 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02006386
Walter Dörwaldd4ade082003-08-15 15:00:26 +00006387 Py_XDECREF(errorHandler);
6388 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006389 return _PyUnicodeWriter_Finish(&writer);
Walter Dörwald8c077222002-03-25 11:16:18 +00006390
Benjamin Peterson29060642009-01-31 22:14:21 +00006391 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006392 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006393 Py_XDECREF(errorHandler);
6394 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006395 return NULL;
6396}
6397
Eric V. Smith42454af2016-10-31 09:22:08 -04006398PyObject *
6399PyUnicode_DecodeUnicodeEscape(const char *s,
6400 Py_ssize_t size,
6401 const char *errors)
6402{
6403 const char *first_invalid_escape;
6404 PyObject *result = _PyUnicode_DecodeUnicodeEscape(s, size, errors,
6405 &first_invalid_escape);
6406 if (result == NULL)
6407 return NULL;
6408 if (first_invalid_escape != NULL) {
6409 if (PyErr_WarnFormat(PyExc_DeprecationWarning, 1,
6410 "invalid escape sequence '\\%c'",
Serhiy Storchaka56cb4652017-10-20 17:08:15 +03006411 (unsigned char)*first_invalid_escape) < 0) {
Eric V. Smith42454af2016-10-31 09:22:08 -04006412 Py_DECREF(result);
6413 return NULL;
6414 }
6415 }
6416 return result;
6417}
6418
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006419/* Return a Unicode-Escape string version of the Unicode object. */
Guido van Rossumd57fd912000-03-10 22:53:23 +00006420
Alexander Belopolsky40018472011-02-26 01:02:56 +00006421PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006422PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006423{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006424 Py_ssize_t i, len;
Victor Stinner62ec3312016-09-06 17:04:34 -07006425 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006426 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006427 enum PyUnicode_Kind kind;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006428 void *data;
Victor Stinner62ec3312016-09-06 17:04:34 -07006429 Py_ssize_t expandsize;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006430
Ezio Melottie7f90372012-10-05 03:33:31 +03006431 /* Initial allocation is based on the longest-possible character
Thomas Wouters89f507f2006-12-13 04:49:30 +00006432 escape.
6433
Ezio Melottie7f90372012-10-05 03:33:31 +03006434 For UCS1 strings it's '\xxx', 4 bytes per source character.
6435 For UCS2 strings it's '\uxxxx', 6 bytes per source character.
6436 For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
Thomas Wouters89f507f2006-12-13 04:49:30 +00006437 */
6438
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006439 if (!PyUnicode_Check(unicode)) {
6440 PyErr_BadArgument();
6441 return NULL;
6442 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006443 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006444 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006445 }
Victor Stinner358af132015-10-12 22:36:57 +02006446
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006447 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006448 if (len == 0) {
6449 return PyBytes_FromStringAndSize(NULL, 0);
6450 }
6451
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006452 kind = PyUnicode_KIND(unicode);
6453 data = PyUnicode_DATA(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006454 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6455 bytes, and 1 byte characters 4. */
6456 expandsize = kind * 2 + 2;
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006457 if (len > PY_SSIZE_T_MAX / expandsize) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006458 return PyErr_NoMemory();
6459 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006460 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
Victor Stinner62ec3312016-09-06 17:04:34 -07006461 if (repr == NULL) {
6462 return NULL;
6463 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006464
Victor Stinner62ec3312016-09-06 17:04:34 -07006465 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006466 for (i = 0; i < len; i++) {
Victor Stinner3326cb62011-11-10 20:15:25 +01006467 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006468
Victor Stinner62ec3312016-09-06 17:04:34 -07006469 /* U+0000-U+00ff range */
6470 if (ch < 0x100) {
6471 if (ch >= ' ' && ch < 127) {
6472 if (ch != '\\') {
6473 /* Copy printable US ASCII as-is */
6474 *p++ = (char) ch;
6475 }
6476 /* Escape backslashes */
6477 else {
6478 *p++ = '\\';
6479 *p++ = '\\';
6480 }
6481 }
Victor Stinner358af132015-10-12 22:36:57 +02006482
Victor Stinner62ec3312016-09-06 17:04:34 -07006483 /* Map special whitespace to '\t', \n', '\r' */
6484 else if (ch == '\t') {
6485 *p++ = '\\';
6486 *p++ = 't';
6487 }
6488 else if (ch == '\n') {
6489 *p++ = '\\';
6490 *p++ = 'n';
6491 }
6492 else if (ch == '\r') {
6493 *p++ = '\\';
6494 *p++ = 'r';
6495 }
6496
6497 /* Map non-printable US ASCII and 8-bit characters to '\xHH' */
6498 else {
6499 *p++ = '\\';
6500 *p++ = 'x';
6501 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6502 *p++ = Py_hexdigits[ch & 0x000F];
6503 }
Tim Petersced69f82003-09-16 20:30:58 +00006504 }
Serhiy Storchakaac0720e2016-11-21 11:46:51 +02006505 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006506 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006507 *p++ = '\\';
6508 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006509 *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6510 *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6511 *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6512 *p++ = Py_hexdigits[ch & 0x000F];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006513 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006514 /* U+010000-U+10ffff range: Map 21-bit characters to '\U00HHHHHH' */
6515 else {
Marc-André Lemburg80d1dd52001-07-25 16:05:59 +00006516
Victor Stinner62ec3312016-09-06 17:04:34 -07006517 /* Make sure that the first two digits are zero */
6518 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006519 *p++ = '\\';
Victor Stinner62ec3312016-09-06 17:04:34 -07006520 *p++ = 'U';
6521 *p++ = '0';
6522 *p++ = '0';
6523 *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
6524 *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
6525 *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
6526 *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
6527 *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
6528 *p++ = Py_hexdigits[ch & 0x0000000F];
Ka-Ping Yeefa004ad2001-01-24 17:19:08 +00006529 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006530 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006531
Victor Stinner62ec3312016-09-06 17:04:34 -07006532 assert(p - PyBytes_AS_STRING(repr) > 0);
6533 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6534 return NULL;
6535 }
6536 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006537}
6538
Alexander Belopolsky40018472011-02-26 01:02:56 +00006539PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006540PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6541 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006542{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006543 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006544 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Victor Stinner62ec3312016-09-06 17:04:34 -07006545 if (tmp == NULL) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006546 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006547 }
6548
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006549 result = PyUnicode_AsUnicodeEscapeString(tmp);
6550 Py_DECREF(tmp);
6551 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006552}
6553
/* --- Raw Unicode Escape Codec ------------------------------------------- */
6555
Alexander Belopolsky40018472011-02-26 01:02:56 +00006556PyObject *
6557PyUnicode_DecodeRawUnicodeEscape(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006558 Py_ssize_t size,
6559 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006560{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006561 const char *starts = s;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006562 _PyUnicodeWriter writer;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006563 const char *end;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006564 PyObject *errorHandler = NULL;
6565 PyObject *exc = NULL;
Tim Petersced69f82003-09-16 20:30:58 +00006566
Victor Stinner62ec3312016-09-06 17:04:34 -07006567 if (size == 0) {
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02006568 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner62ec3312016-09-06 17:04:34 -07006569 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006570
Guido van Rossumd57fd912000-03-10 22:53:23 +00006571 /* Escaped strings will always be longer than the resulting
6572 Unicode string, so we start with size here and then reduce the
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006573 length after conversion to the true value. (But decoding error
6574 handler might have to resize the string) */
Victor Stinner8f674cc2013-04-17 23:02:17 +02006575 _PyUnicodeWriter_Init(&writer);
Inada Naoki770847a2019-06-24 12:30:24 +09006576 writer.min_length = size;
Victor Stinner62ec3312016-09-06 17:04:34 -07006577 if (_PyUnicodeWriter_Prepare(&writer, size, 127) < 0) {
6578 goto onError;
6579 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006580
Guido van Rossumd57fd912000-03-10 22:53:23 +00006581 end = s + size;
6582 while (s < end) {
Victor Stinner62ec3312016-09-06 17:04:34 -07006583 unsigned char c = (unsigned char) *s++;
6584 Py_UCS4 ch;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006585 int count;
Victor Stinner62ec3312016-09-06 17:04:34 -07006586 Py_ssize_t startinpos;
6587 Py_ssize_t endinpos;
6588 const char *message;
6589
6590#define WRITE_CHAR(ch) \
6591 do { \
6592 if (ch <= writer.maxchar) { \
6593 assert(writer.pos < writer.size); \
6594 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, ch); \
6595 } \
6596 else if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0) { \
6597 goto onError; \
6598 } \
6599 } while(0)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006600
Benjamin Peterson29060642009-01-31 22:14:21 +00006601 /* Non-escape characters are interpreted as Unicode ordinals */
Victor Stinner62ec3312016-09-06 17:04:34 -07006602 if (c != '\\' || s >= end) {
6603 WRITE_CHAR(c);
Benjamin Peterson29060642009-01-31 22:14:21 +00006604 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006605 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006606
Victor Stinner62ec3312016-09-06 17:04:34 -07006607 c = (unsigned char) *s++;
6608 if (c == 'u') {
6609 count = 4;
6610 message = "truncated \\uXXXX escape";
Benjamin Peterson29060642009-01-31 22:14:21 +00006611 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006612 else if (c == 'U') {
6613 count = 8;
6614 message = "truncated \\UXXXXXXXX escape";
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006615 }
6616 else {
Victor Stinner62ec3312016-09-06 17:04:34 -07006617 assert(writer.pos < writer.size);
6618 PyUnicode_WRITE(writer.kind, writer.data, writer.pos++, '\\');
6619 WRITE_CHAR(c);
6620 continue;
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +00006621 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006622 startinpos = s - starts - 2;
6623
6624 /* \uHHHH with 4 hex digits, \U00HHHHHH with 8 */
6625 for (ch = 0; count && s < end; ++s, --count) {
6626 c = (unsigned char)*s;
6627 ch <<= 4;
6628 if (c >= '0' && c <= '9') {
6629 ch += c - '0';
6630 }
6631 else if (c >= 'a' && c <= 'f') {
6632 ch += c - ('a' - 10);
6633 }
6634 else if (c >= 'A' && c <= 'F') {
6635 ch += c - ('A' - 10);
6636 }
6637 else {
6638 break;
6639 }
6640 }
6641 if (!count) {
6642 if (ch <= MAX_UNICODE) {
6643 WRITE_CHAR(ch);
6644 continue;
6645 }
6646 message = "\\Uxxxxxxxx out of range";
6647 }
6648
6649 endinpos = s-starts;
6650 writer.min_length = end - s + writer.pos;
6651 if (unicode_decode_call_errorhandler_writer(
6652 errors, &errorHandler,
6653 "rawunicodeescape", message,
6654 &starts, &end, &startinpos, &endinpos, &exc, &s,
6655 &writer)) {
6656 goto onError;
6657 }
Serhiy Storchakab7e2d672018-02-13 08:27:33 +02006658 assert(end - s <= writer.size - writer.pos);
Victor Stinner62ec3312016-09-06 17:04:34 -07006659
6660#undef WRITE_CHAR
Guido van Rossumd57fd912000-03-10 22:53:23 +00006661 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006662 Py_XDECREF(errorHandler);
6663 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006664 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00006665
Benjamin Peterson29060642009-01-31 22:14:21 +00006666 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01006667 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006668 Py_XDECREF(errorHandler);
6669 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006670 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006671
Guido van Rossumd57fd912000-03-10 22:53:23 +00006672}
6673
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006674
Alexander Belopolsky40018472011-02-26 01:02:56 +00006675PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006676PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006677{
Victor Stinner62ec3312016-09-06 17:04:34 -07006678 PyObject *repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006679 char *p;
Victor Stinner62ec3312016-09-06 17:04:34 -07006680 Py_ssize_t expandsize, pos;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006681 int kind;
6682 void *data;
6683 Py_ssize_t len;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006684
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006685 if (!PyUnicode_Check(unicode)) {
6686 PyErr_BadArgument();
6687 return NULL;
6688 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006689 if (PyUnicode_READY(unicode) == -1) {
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006690 return NULL;
Victor Stinner62ec3312016-09-06 17:04:34 -07006691 }
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006692 kind = PyUnicode_KIND(unicode);
6693 data = PyUnicode_DATA(unicode);
6694 len = PyUnicode_GET_LENGTH(unicode);
Victor Stinner62ec3312016-09-06 17:04:34 -07006695 if (kind == PyUnicode_1BYTE_KIND) {
6696 return PyBytes_FromStringAndSize(data, len);
6697 }
Victor Stinner0e368262011-11-10 20:12:49 +01006698
Victor Stinner62ec3312016-09-06 17:04:34 -07006699 /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6700 bytes, and 1 byte characters 4. */
6701 expandsize = kind * 2 + 2;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006702
Victor Stinner62ec3312016-09-06 17:04:34 -07006703 if (len > PY_SSIZE_T_MAX / expandsize) {
6704 return PyErr_NoMemory();
6705 }
6706 repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6707 if (repr == NULL) {
6708 return NULL;
6709 }
6710 if (len == 0) {
6711 return repr;
6712 }
6713
6714 p = PyBytes_AS_STRING(repr);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006715 for (pos = 0; pos < len; pos++) {
6716 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Victor Stinner358af132015-10-12 22:36:57 +02006717
Victor Stinner62ec3312016-09-06 17:04:34 -07006718 /* U+0000-U+00ff range: Copy 8-bit characters as-is */
6719 if (ch < 0x100) {
6720 *p++ = (char) ch;
Tim Petersced69f82003-09-16 20:30:58 +00006721 }
Xiang Zhang2b77a922018-02-13 18:33:32 +08006722 /* U+0100-U+ffff range: Map 16-bit characters to '\uHHHH' */
Victor Stinner62ec3312016-09-06 17:04:34 -07006723 else if (ch < 0x10000) {
Guido van Rossumd57fd912000-03-10 22:53:23 +00006724 *p++ = '\\';
6725 *p++ = 'u';
Victor Stinnerf5cff562011-10-14 02:13:11 +02006726 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6727 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6728 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6729 *p++ = Py_hexdigits[ch & 15];
Guido van Rossumd57fd912000-03-10 22:53:23 +00006730 }
Victor Stinner62ec3312016-09-06 17:04:34 -07006731 /* U+010000-U+10ffff range: Map 32-bit characters to '\U00HHHHHH' */
6732 else {
6733 assert(ch <= MAX_UNICODE && MAX_UNICODE <= 0x10ffff);
6734 *p++ = '\\';
6735 *p++ = 'U';
6736 *p++ = '0';
6737 *p++ = '0';
6738 *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6739 *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6740 *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6741 *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6742 *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6743 *p++ = Py_hexdigits[ch & 15];
6744 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00006745 }
Guido van Rossum98297ee2007-11-06 21:34:58 +00006746
Victor Stinner62ec3312016-09-06 17:04:34 -07006747 assert(p > PyBytes_AS_STRING(repr));
6748 if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0) {
6749 return NULL;
6750 }
6751 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006752}
6753
Alexander Belopolsky40018472011-02-26 01:02:56 +00006754PyObject *
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006755PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6756 Py_ssize_t size)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006757{
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006758 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02006759 PyObject *tmp = PyUnicode_FromWideChar(s, size);
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006760 if (tmp == NULL)
Walter Dörwald711005d2007-05-12 12:03:26 +00006761 return NULL;
Martin v. Löwis1db7c132011-11-10 18:24:32 +01006762 result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6763 Py_DECREF(tmp);
6764 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00006765}
6766
/* --- Latin-1 Codec ------------------------------------------------------ */
6768
Alexander Belopolsky40018472011-02-26 01:02:56 +00006769PyObject *
6770PyUnicode_DecodeLatin1(const char *s,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006771 Py_ssize_t size,
6772 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00006773{
Guido van Rossumd57fd912000-03-10 22:53:23 +00006774 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
Andy Lestere6be9b52020-02-11 20:28:35 -06006775 return _PyUnicode_FromUCS1((const unsigned char*)s, size);
Guido van Rossumd57fd912000-03-10 22:53:23 +00006776}
6777
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006778/* create or adjust a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006779static void
6780make_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006781 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006782 PyObject *unicode,
6783 Py_ssize_t startpos, Py_ssize_t endpos,
6784 const char *reason)
6785{
6786 if (*exceptionObject == NULL) {
6787 *exceptionObject = PyObject_CallFunction(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006788 PyExc_UnicodeEncodeError, "sOnns",
Martin v. Löwis9e816682011-11-02 12:45:42 +01006789 encoding, unicode, startpos, endpos, reason);
6790 }
6791 else {
6792 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6793 goto onError;
6794 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6795 goto onError;
6796 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6797 goto onError;
6798 return;
6799 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02006800 Py_CLEAR(*exceptionObject);
Martin v. Löwis9e816682011-11-02 12:45:42 +01006801 }
6802}
6803
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006804/* raises a UnicodeEncodeError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006805static void
6806raise_encode_exception(PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006807 const char *encoding,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006808 PyObject *unicode,
6809 Py_ssize_t startpos, Py_ssize_t endpos,
6810 const char *reason)
6811{
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006812 make_encode_exception(exceptionObject,
Martin v. Löwis9e816682011-11-02 12:45:42 +01006813 encoding, unicode, startpos, endpos, reason);
6814 if (*exceptionObject != NULL)
6815 PyCodec_StrictErrors(*exceptionObject);
6816}
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006817
6818/* error handling callback helper:
6819 build arguments, call the callback and check the arguments,
6820 put the result into newpos and return the replacement string, which
6821 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00006822static PyObject *
6823unicode_encode_call_errorhandler(const char *errors,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006824 PyObject **errorHandler,
6825 const char *encoding, const char *reason,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006826 PyObject *unicode, PyObject **exceptionObject,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006827 Py_ssize_t startpos, Py_ssize_t endpos,
6828 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006829{
Serhiy Storchaka2d06e842015-12-25 19:53:18 +02006830 static const char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006831 Py_ssize_t len;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006832 PyObject *restuple;
6833 PyObject *resunicode;
6834
6835 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006836 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006837 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006838 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006839 }
6840
Benjamin Petersonbac79492012-01-14 13:34:47 -05006841 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006842 return NULL;
6843 len = PyUnicode_GET_LENGTH(unicode);
6844
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006845 make_encode_exception(exceptionObject,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006846 encoding, unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006847 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006848 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006849
Petr Viktorinffd97532020-02-11 17:46:57 +01006850 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006851 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006852 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006853 if (!PyTuple_Check(restuple)) {
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006854 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00006855 Py_DECREF(restuple);
6856 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006857 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006858 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00006859 &resunicode, newpos)) {
6860 Py_DECREF(restuple);
6861 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006862 }
Martin v. Löwisdb12d452009-05-02 18:52:14 +00006863 if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6864 PyErr_SetString(PyExc_TypeError, &argparse[3]);
6865 Py_DECREF(restuple);
6866 return NULL;
6867 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006868 if (*newpos<0)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006869 *newpos = len + *newpos;
6870 if (*newpos<0 || *newpos>len) {
Victor Stinnera33bce02014-07-04 22:47:46 +02006871 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00006872 Py_DECREF(restuple);
6873 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00006874 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006875 Py_INCREF(resunicode);
6876 Py_DECREF(restuple);
6877 return resunicode;
6878}
6879
Alexander Belopolsky40018472011-02-26 01:02:56 +00006880static PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006881unicode_encode_ucs1(PyObject *unicode,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03006882 const char *errors,
Victor Stinner0030cd52015-09-24 14:45:00 +02006883 const Py_UCS4 limit)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006884{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006885 /* input state */
6886 Py_ssize_t pos=0, size;
6887 int kind;
6888 void *data;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006889 /* pointer into the output */
6890 char *str;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00006891 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6892 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
Victor Stinner50149202015-09-22 00:26:54 +02006893 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006894 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02006895 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner6bd525b2015-10-09 13:10:05 +02006896 PyObject *rep = NULL;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006897 /* output object */
6898 _PyBytesWriter writer;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006899
Benjamin Petersonbac79492012-01-14 13:34:47 -05006900 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006901 return NULL;
6902 size = PyUnicode_GET_LENGTH(unicode);
6903 kind = PyUnicode_KIND(unicode);
6904 data = PyUnicode_DATA(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006905 /* allocate enough for a simple encoding without
6906 replacements, if we need more, we'll resize */
Guido van Rossum98297ee2007-11-06 21:34:58 +00006907 if (size == 0)
Christian Heimes72b710a2008-05-26 13:28:38 +00006908 return PyBytes_FromStringAndSize(NULL, 0);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006909
6910 _PyBytesWriter_Init(&writer);
6911 str = _PyBytesWriter_Alloc(&writer, size);
6912 if (str == NULL)
Guido van Rossum98297ee2007-11-06 21:34:58 +00006913 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006914
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006915 while (pos < size) {
Victor Stinner0030cd52015-09-24 14:45:00 +02006916 Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00006917
Benjamin Peterson29060642009-01-31 22:14:21 +00006918 /* can we encode this? */
Victor Stinner0030cd52015-09-24 14:45:00 +02006919 if (ch < limit) {
Benjamin Peterson29060642009-01-31 22:14:21 +00006920 /* no overflow check, because we know that the space is enough */
Victor Stinner0030cd52015-09-24 14:45:00 +02006921 *str++ = (char)ch;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006922 ++pos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00006923 }
Benjamin Peterson29060642009-01-31 22:14:21 +00006924 else {
Victor Stinner6bd525b2015-10-09 13:10:05 +02006925 Py_ssize_t newpos, i;
Benjamin Peterson29060642009-01-31 22:14:21 +00006926 /* startpos for collecting unencodable chars */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006927 Py_ssize_t collstart = pos;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006928 Py_ssize_t collend = collstart + 1;
Benjamin Peterson29060642009-01-31 22:14:21 +00006929 /* find all unecodable characters */
Victor Stinner50149202015-09-22 00:26:54 +02006930
Benjamin Petersona1c1be42014-09-29 18:18:57 -04006931 while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
Benjamin Peterson29060642009-01-31 22:14:21 +00006932 ++collend;
Victor Stinner50149202015-09-22 00:26:54 +02006933
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006934 /* Only overallocate the buffer if it's not the last write */
6935 writer.overallocate = (collend < size);
6936
Benjamin Peterson29060642009-01-31 22:14:21 +00006937 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02006938 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02006939 error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02006940
6941 switch (error_handler) {
6942 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01006943 raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00006944 goto onError;
Victor Stinner50149202015-09-22 00:26:54 +02006945
6946 case _Py_ERROR_REPLACE:
Victor Stinner01ada392015-10-01 21:54:51 +02006947 memset(str, '?', collend - collstart);
6948 str += (collend - collstart);
Stefan Krahf432a322017-08-21 13:09:59 +02006949 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02006950 case _Py_ERROR_IGNORE:
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006951 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006952 break;
Victor Stinner50149202015-09-22 00:26:54 +02006953
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006954 case _Py_ERROR_BACKSLASHREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006955 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006956 writer.min_size -= (collend - collstart);
6957 str = backslashreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006958 unicode, collstart, collend);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006959 if (str == NULL)
6960 goto onError;
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006961 pos = collend;
6962 break;
Victor Stinnerfdfbf782015-10-09 00:33:49 +02006963
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006964 case _Py_ERROR_XMLCHARREFREPLACE:
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006965 /* subtract preallocated bytes */
Victor Stinnerad771582015-10-09 12:38:53 +02006966 writer.min_size -= (collend - collstart);
6967 str = xmlcharrefreplace(&writer, str,
Victor Stinnere7bf86c2015-10-09 01:39:28 +02006968 unicode, collstart, collend);
6969 if (str == NULL)
6970 goto onError;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01006971 pos = collend;
Benjamin Peterson29060642009-01-31 22:14:21 +00006972 break;
Victor Stinner50149202015-09-22 00:26:54 +02006973
Victor Stinnerc3713e92015-09-29 12:32:13 +02006974 case _Py_ERROR_SURROGATEESCAPE:
6975 for (i = collstart; i < collend; ++i) {
6976 ch = PyUnicode_READ(kind, data, i);
6977 if (ch < 0xdc80 || 0xdcff < ch) {
6978 /* Not a UTF-8b surrogate */
6979 break;
6980 }
6981 *str++ = (char)(ch - 0xdc00);
6982 ++pos;
6983 }
6984 if (i >= collend)
6985 break;
6986 collstart = pos;
6987 assert(collstart != collend);
Stefan Krahf432a322017-08-21 13:09:59 +02006988 /* fall through */
Victor Stinnerc3713e92015-09-29 12:32:13 +02006989
Benjamin Peterson29060642009-01-31 22:14:21 +00006990 default:
Victor Stinner6bd525b2015-10-09 13:10:05 +02006991 rep = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6992 encoding, reason, unicode, &exc,
6993 collstart, collend, &newpos);
6994 if (rep == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00006995 goto onError;
Victor Stinner0030cd52015-09-24 14:45:00 +02006996
Raymond Hettinger15f44ab2016-08-30 10:47:49 -07006997 /* subtract preallocated bytes */
Xiang Zhangd04d8472016-11-23 19:34:01 +08006998 writer.min_size -= newpos - collstart;
Victor Stinnerad771582015-10-09 12:38:53 +02006999
Victor Stinner6bd525b2015-10-09 13:10:05 +02007000 if (PyBytes_Check(rep)) {
Martin v. Löwis011e8422009-05-05 04:43:17 +00007001 /* Directly copy bytes result to output. */
Victor Stinnerce179bf2015-10-09 12:57:22 +02007002 str = _PyBytesWriter_WriteBytes(&writer, str,
Victor Stinner6bd525b2015-10-09 13:10:05 +02007003 PyBytes_AS_STRING(rep),
7004 PyBytes_GET_SIZE(rep));
Martin v. Löwisdb12d452009-05-02 18:52:14 +00007005 }
Victor Stinner6bd525b2015-10-09 13:10:05 +02007006 else {
7007 assert(PyUnicode_Check(rep));
Victor Stinner0030cd52015-09-24 14:45:00 +02007008
Victor Stinner6bd525b2015-10-09 13:10:05 +02007009 if (PyUnicode_READY(rep) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00007010 goto onError;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007011
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007012 if (limit == 256 ?
7013 PyUnicode_KIND(rep) != PyUnicode_1BYTE_KIND :
7014 !PyUnicode_IS_ASCII(rep))
7015 {
7016 /* Not all characters are smaller than limit */
7017 raise_encode_exception(&exc, encoding, unicode,
7018 collstart, collend, reason);
7019 goto onError;
Benjamin Peterson29060642009-01-31 22:14:21 +00007020 }
Serhiy Storchaka99250d52016-11-23 15:13:00 +02007021 assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
7022 str = _PyBytesWriter_WriteBytes(&writer, str,
7023 PyUnicode_DATA(rep),
7024 PyUnicode_GET_LENGTH(rep));
Benjamin Peterson29060642009-01-31 22:14:21 +00007025 }
Alexey Izbyshev74a307d2018-08-19 21:52:04 +03007026 if (str == NULL)
7027 goto onError;
7028
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007029 pos = newpos;
Victor Stinner6bd525b2015-10-09 13:10:05 +02007030 Py_CLEAR(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007031 }
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007032
7033 /* If overallocation was disabled, ensure that it was the last
7034 write. Otherwise, we missed an optimization */
7035 assert(writer.overallocate || pos == size);
Benjamin Peterson14339b62009-01-31 16:36:08 +00007036 }
7037 }
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007038
Victor Stinner50149202015-09-22 00:26:54 +02007039 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007040 Py_XDECREF(exc);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007041 return _PyBytesWriter_Finish(&writer, str);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007042
7043 onError:
Victor Stinner6bd525b2015-10-09 13:10:05 +02007044 Py_XDECREF(rep);
Victor Stinnerfdfbf782015-10-09 00:33:49 +02007045 _PyBytesWriter_Dealloc(&writer);
Victor Stinner50149202015-09-22 00:26:54 +02007046 Py_XDECREF(error_handler_obj);
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00007047 Py_XDECREF(exc);
7048 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007049}
7050
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007051/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007052PyObject *
7053PyUnicode_EncodeLatin1(const Py_UNICODE *p,
Ezio Melotti2aa2b3b2011-09-29 00:58:57 +03007054 Py_ssize_t size,
7055 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007056{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007057 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007058 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007059 if (unicode == NULL)
7060 return NULL;
7061 result = unicode_encode_ucs1(unicode, errors, 256);
7062 Py_DECREF(unicode);
7063 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007064}
7065
Alexander Belopolsky40018472011-02-26 01:02:56 +00007066PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007067_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007068{
7069 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007070 PyErr_BadArgument();
7071 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007072 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007073 if (PyUnicode_READY(unicode) == -1)
7074 return NULL;
7075 /* Fast path: if it is a one-byte string, construct
7076 bytes object directly. */
7077 if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
7078 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7079 PyUnicode_GET_LENGTH(unicode));
7080 /* Non-Latin-1 characters present. Defer to above function to
7081 raise the exception. */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007082 return unicode_encode_ucs1(unicode, errors, 256);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007083}
7084
7085PyObject*
7086PyUnicode_AsLatin1String(PyObject *unicode)
7087{
7088 return _PyUnicode_AsLatin1String(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007089}
7090
7091/* --- 7-bit ASCII Codec -------------------------------------------------- */
7092
Alexander Belopolsky40018472011-02-26 01:02:56 +00007093PyObject *
7094PyUnicode_DecodeASCII(const char *s,
7095 Py_ssize_t size,
7096 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007097{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007098 const char *starts = s;
Inada Naoki770847a2019-06-24 12:30:24 +09007099 const char *e = s + size;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007100 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007101 PyObject *exc = NULL;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007102 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Tim Petersced69f82003-09-16 20:30:58 +00007103
Guido van Rossumd57fd912000-03-10 22:53:23 +00007104 if (size == 0)
Serhiy Storchaka678db842013-01-26 12:16:36 +02007105 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01007106
Guido van Rossumd57fd912000-03-10 22:53:23 +00007107 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
Victor Stinner702c7342011-10-05 13:50:52 +02007108 if (size == 1 && (unsigned char)s[0] < 128)
7109 return get_latin1_char((unsigned char)s[0]);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007110
Inada Naoki770847a2019-06-24 12:30:24 +09007111 // Shortcut for simple case
7112 PyObject *u = PyUnicode_New(size, 127);
7113 if (u == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +02007114 return NULL;
Inada Naoki770847a2019-06-24 12:30:24 +09007115 }
7116 Py_ssize_t outpos = ascii_decode(s, e, PyUnicode_DATA(u));
7117 if (outpos == size) {
7118 return u;
7119 }
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007120
Inada Naoki770847a2019-06-24 12:30:24 +09007121 _PyUnicodeWriter writer;
7122 _PyUnicodeWriter_InitWithBuffer(&writer, u);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007123 writer.pos = outpos;
Antoine Pitrouca5f91b2012-05-10 16:36:02 +02007124
Inada Naoki770847a2019-06-24 12:30:24 +09007125 s += outpos;
7126 int kind = writer.kind;
7127 void *data = writer.data;
7128 Py_ssize_t startinpos, endinpos;
7129
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007130 while (s < e) {
Antoine Pitrou9ed5f272013-08-13 20:18:52 +02007131 unsigned char c = (unsigned char)*s;
Benjamin Peterson29060642009-01-31 22:14:21 +00007132 if (c < 128) {
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007133 PyUnicode_WRITE(kind, data, writer.pos, c);
7134 writer.pos++;
Benjamin Peterson29060642009-01-31 22:14:21 +00007135 ++s;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007136 continue;
Benjamin Peterson29060642009-01-31 22:14:21 +00007137 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007138
7139 /* byte outsize range 0x00..0x7f: call the error handler */
7140
7141 if (error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02007142 error_handler = _Py_GetErrorHandler(errors);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007143
7144 switch (error_handler)
7145 {
7146 case _Py_ERROR_REPLACE:
7147 case _Py_ERROR_SURROGATEESCAPE:
7148 /* Fast-path: the error handler only writes one character,
Victor Stinnerca9381e2015-09-22 00:58:32 +02007149 but we may switch to UCS2 at the first write */
7150 if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
7151 goto onError;
7152 kind = writer.kind;
7153 data = writer.data;
Victor Stinnerf96418d2015-09-21 23:06:27 +02007154
7155 if (error_handler == _Py_ERROR_REPLACE)
7156 PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
7157 else
7158 PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
7159 writer.pos++;
7160 ++s;
7161 break;
7162
7163 case _Py_ERROR_IGNORE:
7164 ++s;
7165 break;
7166
7167 default:
Benjamin Peterson29060642009-01-31 22:14:21 +00007168 startinpos = s-starts;
7169 endinpos = startinpos + 1;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007170 if (unicode_decode_call_errorhandler_writer(
Victor Stinnerf96418d2015-09-21 23:06:27 +02007171 errors, &error_handler_obj,
Benjamin Peterson29060642009-01-31 22:14:21 +00007172 "ascii", "ordinal not in range(128)",
7173 &starts, &e, &startinpos, &endinpos, &exc, &s,
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007174 &writer))
Benjamin Peterson29060642009-01-31 22:14:21 +00007175 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007176 kind = writer.kind;
7177 data = writer.data;
Benjamin Peterson29060642009-01-31 22:14:21 +00007178 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00007179 }
Victor Stinnerf96418d2015-09-21 23:06:27 +02007180 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007181 Py_XDECREF(exc);
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007182 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00007183
Benjamin Peterson29060642009-01-31 22:14:21 +00007184 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007185 _PyUnicodeWriter_Dealloc(&writer);
Victor Stinnerf96418d2015-09-21 23:06:27 +02007186 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00007187 Py_XDECREF(exc);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007188 return NULL;
7189}
7190
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007191/* Deprecated */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007192PyObject *
7193PyUnicode_EncodeASCII(const Py_UNICODE *p,
7194 Py_ssize_t size,
7195 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007196{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007197 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007198 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007199 if (unicode == NULL)
7200 return NULL;
7201 result = unicode_encode_ucs1(unicode, errors, 128);
7202 Py_DECREF(unicode);
7203 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007204}
7205
Alexander Belopolsky40018472011-02-26 01:02:56 +00007206PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007207_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00007208{
7209 if (!PyUnicode_Check(unicode)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007210 PyErr_BadArgument();
7211 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00007212 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007213 if (PyUnicode_READY(unicode) == -1)
7214 return NULL;
7215 /* Fast path: if it is an ASCII-only string, construct bytes object
7216 directly. Else defer to above function to raise the exception. */
Victor Stinneraf037572013-04-14 18:44:10 +02007217 if (PyUnicode_IS_ASCII(unicode))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007218 return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
7219 PyUnicode_GET_LENGTH(unicode));
Martin v. Löwis23e275b2011-11-02 18:02:51 +01007220 return unicode_encode_ucs1(unicode, errors, 128);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02007221}
7222
7223PyObject *
7224PyUnicode_AsASCIIString(PyObject *unicode)
7225{
7226 return _PyUnicode_AsASCIIString(unicode, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00007227}
7228
Steve Dowercc16be82016-09-08 10:35:16 -07007229#ifdef MS_WINDOWS
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007230
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007231/* --- MBCS codecs for Windows -------------------------------------------- */
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007232
Hirokazu Yamamoto35302462009-03-21 13:23:27 +00007233#if SIZEOF_INT < SIZEOF_SIZE_T
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007234#define NEED_RETRY
7235#endif
7236
/* INT_MAX is the theoretical largest chunk (or INT_MAX / 2 when
   transcoding from UTF-16), but INT_MAX / 4 performs better in
   both cases and also avoids partial characters overrunning the
   length limit in MultiByteToWideChar on Windows */
7241#define DECODING_CHUNK_SIZE (INT_MAX/4)
7242
Victor Stinner3a50e702011-10-18 21:21:00 +02007243#ifndef WC_ERR_INVALID_CHARS
7244# define WC_ERR_INVALID_CHARS 0x0080
7245#endif
7246
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007247static const char*
Victor Stinner3a50e702011-10-18 21:21:00 +02007248code_page_name(UINT code_page, PyObject **obj)
7249{
7250 *obj = NULL;
7251 if (code_page == CP_ACP)
7252 return "mbcs";
7253 if (code_page == CP_UTF7)
7254 return "CP_UTF7";
7255 if (code_page == CP_UTF8)
7256 return "CP_UTF8";
7257
7258 *obj = PyBytes_FromFormat("cp%u", code_page);
7259 if (*obj == NULL)
7260 return NULL;
7261 return PyBytes_AS_STRING(*obj);
7262}
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007263
Victor Stinner3a50e702011-10-18 21:21:00 +02007264static DWORD
7265decode_code_page_flags(UINT code_page)
7266{
7267 if (code_page == CP_UTF7) {
7268 /* The CP_UTF7 decoder only supports flags=0 */
7269 return 0;
7270 }
7271 else
7272 return MB_ERR_INVALID_CHARS;
7273}
7274
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007275/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007276 * Decode a byte string from a Windows code page into unicode object in strict
7277 * mode.
7278 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007279 * Returns consumed size if succeed, returns -2 on decode error, or raise an
7280 * OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007281 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007282static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007283decode_code_page_strict(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007284 wchar_t **buf,
7285 Py_ssize_t *bufsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007286 const char *in,
7287 int insize)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007288{
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007289 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner24729f32011-11-10 20:31:37 +01007290 wchar_t *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007291 DWORD outsize;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007292
7293 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007294 assert(insize > 0);
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007295 while ((outsize = MultiByteToWideChar(code_page, flags,
7296 in, insize, NULL, 0)) <= 0)
7297 {
7298 if (!flags || GetLastError() != ERROR_INVALID_FLAGS) {
7299 goto error;
7300 }
7301 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7302 flags = 0;
7303 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007304
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007305 /* Extend a wchar_t* buffer */
7306 Py_ssize_t n = *bufsize; /* Get the current length */
7307 if (widechar_resize(buf, bufsize, n + outsize) < 0) {
7308 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007309 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007310 out = *buf + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007311
7312 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007313 outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
7314 if (outsize <= 0)
7315 goto error;
7316 return insize;
Victor Stinner554f3f02010-06-16 23:33:54 +00007317
Victor Stinner3a50e702011-10-18 21:21:00 +02007318error:
7319 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7320 return -2;
7321 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007322 return -1;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007323}
7324
Victor Stinner3a50e702011-10-18 21:21:00 +02007325/*
7326 * Decode a byte string from a code page into unicode object with an error
7327 * handler.
7328 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007329 * Returns consumed size if succeed, or raise an OSError or
Victor Stinner3a50e702011-10-18 21:21:00 +02007330 * UnicodeDecodeError exception and returns -1 on error.
7331 */
7332static int
7333decode_code_page_errors(UINT code_page,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007334 wchar_t **buf,
7335 Py_ssize_t *bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007336 const char *in, const int size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007337 const char *errors, int final)
Victor Stinner3a50e702011-10-18 21:21:00 +02007338{
7339 const char *startin = in;
7340 const char *endin = in + size;
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007341 DWORD flags = MB_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007342 /* Ideally, we should get reason from FormatMessage. This is the Windows
7343 2000 English version of the message. */
7344 const char *reason = "No mapping for the Unicode character exists "
7345 "in the target code page.";
7346 /* each step cannot decode more than 1 character, but a character can be
7347 represented as a surrogate pair */
Serhiy Storchaka4013c172018-12-03 10:36:45 +02007348 wchar_t buffer[2], *out;
Victor Stinner9f067f42013-06-05 00:21:31 +02007349 int insize;
7350 Py_ssize_t outsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007351 PyObject *errorHandler = NULL;
7352 PyObject *exc = NULL;
7353 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007354 const char *encoding;
Victor Stinner3a50e702011-10-18 21:21:00 +02007355 DWORD err;
7356 int ret = -1;
7357
7358 assert(size > 0);
7359
7360 encoding = code_page_name(code_page, &encoding_obj);
7361 if (encoding == NULL)
7362 return -1;
7363
Victor Stinner7d00cc12014-03-17 23:08:06 +01007364 if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007365 /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
7366 UnicodeDecodeError. */
7367 make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
7368 if (exc != NULL) {
7369 PyCodec_StrictErrors(exc);
7370 Py_CLEAR(exc);
7371 }
7372 goto error;
7373 }
7374
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007375 /* Extend a wchar_t* buffer */
7376 Py_ssize_t n = *bufsize; /* Get the current length */
7377 if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7378 PyErr_NoMemory();
7379 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007380 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007381 if (widechar_resize(buf, bufsize, n + size * Py_ARRAY_LENGTH(buffer)) < 0) {
7382 goto error;
Victor Stinner3a50e702011-10-18 21:21:00 +02007383 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007384 out = *buf + n;
Victor Stinner3a50e702011-10-18 21:21:00 +02007385
7386 /* Decode the byte string character per character */
Victor Stinner3a50e702011-10-18 21:21:00 +02007387 while (in < endin)
7388 {
7389 /* Decode a character */
7390 insize = 1;
7391 do
7392 {
7393 outsize = MultiByteToWideChar(code_page, flags,
7394 in, insize,
7395 buffer, Py_ARRAY_LENGTH(buffer));
7396 if (outsize > 0)
7397 break;
7398 err = GetLastError();
Serhiy Storchakac1e2c282019-03-20 21:45:18 +02007399 if (err == ERROR_INVALID_FLAGS && flags) {
7400 /* For some code pages (e.g. UTF-7) flags must be set to 0. */
7401 flags = 0;
7402 continue;
7403 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007404 if (err != ERROR_NO_UNICODE_TRANSLATION
7405 && err != ERROR_INSUFFICIENT_BUFFER)
7406 {
7407 PyErr_SetFromWindowsErr(0);
7408 goto error;
7409 }
7410 insize++;
7411 }
7412 /* 4=maximum length of a UTF-8 sequence */
7413 while (insize <= 4 && (in + insize) <= endin);
7414
7415 if (outsize <= 0) {
7416 Py_ssize_t startinpos, endinpos, outpos;
7417
Victor Stinner7d00cc12014-03-17 23:08:06 +01007418 /* last character in partial decode? */
7419 if (in + insize >= endin && !final)
7420 break;
7421
Victor Stinner3a50e702011-10-18 21:21:00 +02007422 startinpos = in - startin;
7423 endinpos = startinpos + 1;
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007424 outpos = out - *buf;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01007425 if (unicode_decode_call_errorhandler_wchar(
Victor Stinner3a50e702011-10-18 21:21:00 +02007426 errors, &errorHandler,
7427 encoding, reason,
7428 &startin, &endin, &startinpos, &endinpos, &exc, &in,
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007429 buf, bufsize, &outpos))
Victor Stinner3a50e702011-10-18 21:21:00 +02007430 {
7431 goto error;
7432 }
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007433 out = *buf + outpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007434 }
7435 else {
7436 in += insize;
7437 memcpy(out, buffer, outsize * sizeof(wchar_t));
7438 out += outsize;
7439 }
7440 }
7441
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007442 /* Shrink the buffer */
7443 assert(out - *buf <= *bufsize);
7444 *bufsize = out - *buf;
Victor Stinnere1f17c62014-07-25 14:03:03 +02007445 /* (in - startin) <= size and size is an int */
7446 ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
Victor Stinner3a50e702011-10-18 21:21:00 +02007447
7448error:
7449 Py_XDECREF(encoding_obj);
7450 Py_XDECREF(errorHandler);
7451 Py_XDECREF(exc);
7452 return ret;
7453}
7454
Victor Stinner3a50e702011-10-18 21:21:00 +02007455static PyObject *
7456decode_code_page_stateful(int code_page,
Victor Stinner76a31a62011-11-04 00:05:13 +01007457 const char *s, Py_ssize_t size,
7458 const char *errors, Py_ssize_t *consumed)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007459{
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007460 wchar_t *buf = NULL;
7461 Py_ssize_t bufsize = 0;
Victor Stinner76a31a62011-11-04 00:05:13 +01007462 int chunk_size, final, converted, done;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007463
Victor Stinner3a50e702011-10-18 21:21:00 +02007464 if (code_page < 0) {
7465 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7466 return NULL;
7467 }
Serhiy Storchaka64e461b2017-07-11 06:55:25 +03007468 if (size < 0) {
7469 PyErr_BadInternalCall();
7470 return NULL;
7471 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007472
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007473 if (consumed)
Benjamin Peterson29060642009-01-31 22:14:21 +00007474 *consumed = 0;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007475
Victor Stinner76a31a62011-11-04 00:05:13 +01007476 do
7477 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007478#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007479 if (size > DECODING_CHUNK_SIZE) {
7480 chunk_size = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007481 final = 0;
7482 done = 0;
7483 }
7484 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007485#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007486 {
7487 chunk_size = (int)size;
7488 final = (consumed == NULL);
7489 done = 1;
7490 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007491
Victor Stinner76a31a62011-11-04 00:05:13 +01007492 if (chunk_size == 0 && done) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007493 if (buf != NULL)
Victor Stinner76a31a62011-11-04 00:05:13 +01007494 break;
Serhiy Storchaka678db842013-01-26 12:16:36 +02007495 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner76a31a62011-11-04 00:05:13 +01007496 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007497
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007498 converted = decode_code_page_strict(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007499 s, chunk_size);
7500 if (converted == -2)
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007501 converted = decode_code_page_errors(code_page, &buf, &bufsize,
Victor Stinner76a31a62011-11-04 00:05:13 +01007502 s, chunk_size,
Victor Stinner7d00cc12014-03-17 23:08:06 +01007503 errors, final);
7504 assert(converted != 0 || done);
Victor Stinner76a31a62011-11-04 00:05:13 +01007505
7506 if (converted < 0) {
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007507 PyMem_Free(buf);
Victor Stinner76a31a62011-11-04 00:05:13 +01007508 return NULL;
7509 }
7510
7511 if (consumed)
7512 *consumed += converted;
7513
7514 s += converted;
7515 size -= converted;
7516 } while (!done);
Victor Stinner3a50e702011-10-18 21:21:00 +02007517
Serhiy Storchakaeeb719e2018-12-04 10:25:50 +02007518 PyObject *v = PyUnicode_FromWideChar(buf, bufsize);
7519 PyMem_Free(buf);
7520 return v;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007521}
7522
Alexander Belopolsky40018472011-02-26 01:02:56 +00007523PyObject *
Victor Stinner3a50e702011-10-18 21:21:00 +02007524PyUnicode_DecodeCodePageStateful(int code_page,
7525 const char *s,
7526 Py_ssize_t size,
7527 const char *errors,
7528 Py_ssize_t *consumed)
7529{
7530 return decode_code_page_stateful(code_page, s, size, errors, consumed);
7531}
7532
7533PyObject *
7534PyUnicode_DecodeMBCSStateful(const char *s,
7535 Py_ssize_t size,
7536 const char *errors,
7537 Py_ssize_t *consumed)
7538{
7539 return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7540}
7541
7542PyObject *
Alexander Belopolsky40018472011-02-26 01:02:56 +00007543PyUnicode_DecodeMBCS(const char *s,
7544 Py_ssize_t size,
7545 const char *errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007546{
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007547 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7548}
7549
Victor Stinner3a50e702011-10-18 21:21:00 +02007550static DWORD
7551encode_code_page_flags(UINT code_page, const char *errors)
7552{
7553 if (code_page == CP_UTF8) {
Steve Dower3e96f322015-03-02 08:01:10 -08007554 return WC_ERR_INVALID_CHARS;
Victor Stinner3a50e702011-10-18 21:21:00 +02007555 }
7556 else if (code_page == CP_UTF7) {
7557 /* CP_UTF7 only supports flags=0 */
7558 return 0;
7559 }
7560 else {
7561 if (errors != NULL && strcmp(errors, "replace") == 0)
7562 return 0;
7563 else
7564 return WC_NO_BEST_FIT_CHARS;
7565 }
7566}
7567
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007568/*
Victor Stinner3a50e702011-10-18 21:21:00 +02007569 * Encode a Unicode string to a Windows code page into a byte string in strict
7570 * mode.
7571 *
7572 * Returns consumed characters if succeed, returns -2 on encode error, or raise
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007573 * an OSError and returns -1 on other error.
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007574 */
Alexander Belopolsky40018472011-02-26 01:02:56 +00007575static int
Victor Stinner3a50e702011-10-18 21:21:00 +02007576encode_code_page_strict(UINT code_page, PyObject **outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007577 PyObject *unicode, Py_ssize_t offset, int len,
Victor Stinner3a50e702011-10-18 21:21:00 +02007578 const char* errors)
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007579{
Victor Stinner554f3f02010-06-16 23:33:54 +00007580 BOOL usedDefaultChar = FALSE;
Victor Stinner3a50e702011-10-18 21:21:00 +02007581 BOOL *pusedDefaultChar = &usedDefaultChar;
7582 int outsize;
Victor Stinner24729f32011-11-10 20:31:37 +01007583 wchar_t *p;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007584 Py_ssize_t size;
Victor Stinner3a50e702011-10-18 21:21:00 +02007585 const DWORD flags = encode_code_page_flags(code_page, NULL);
7586 char *out;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007587 /* Create a substring so that we can get the UTF-16 representation
7588 of just the slice under consideration. */
7589 PyObject *substring;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007590
Martin v. Löwis3d325192011-11-04 18:23:06 +01007591 assert(len > 0);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007592
Victor Stinner3a50e702011-10-18 21:21:00 +02007593 if (code_page != CP_UTF8 && code_page != CP_UTF7)
Victor Stinner554f3f02010-06-16 23:33:54 +00007594 pusedDefaultChar = &usedDefaultChar;
Victor Stinner3a50e702011-10-18 21:21:00 +02007595 else
Victor Stinner554f3f02010-06-16 23:33:54 +00007596 pusedDefaultChar = NULL;
Victor Stinner554f3f02010-06-16 23:33:54 +00007597
Victor Stinner2fc507f2011-11-04 20:06:39 +01007598 substring = PyUnicode_Substring(unicode, offset, offset+len);
7599 if (substring == NULL)
7600 return -1;
7601 p = PyUnicode_AsUnicodeAndSize(substring, &size);
7602 if (p == NULL) {
7603 Py_DECREF(substring);
7604 return -1;
7605 }
Victor Stinner9f067f42013-06-05 00:21:31 +02007606 assert(size <= INT_MAX);
Martin v. Löwis3d325192011-11-04 18:23:06 +01007607
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007608 /* First get the size of the result */
Victor Stinner3a50e702011-10-18 21:21:00 +02007609 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007610 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007611 NULL, 0,
7612 NULL, pusedDefaultChar);
7613 if (outsize <= 0)
7614 goto error;
7615 /* If we used a default char, then we failed! */
Victor Stinner2fc507f2011-11-04 20:06:39 +01007616 if (pusedDefaultChar && *pusedDefaultChar) {
7617 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007618 return -2;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007619 }
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007620
Victor Stinner3a50e702011-10-18 21:21:00 +02007621 if (*outbytes == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00007622 /* Create string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007623 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007624 if (*outbytes == NULL) {
7625 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007626 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007627 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007628 out = PyBytes_AS_STRING(*outbytes);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007629 }
7630 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00007631 /* Extend string object */
Victor Stinner3a50e702011-10-18 21:21:00 +02007632 const Py_ssize_t n = PyBytes_Size(*outbytes);
7633 if (outsize > PY_SSIZE_T_MAX - n) {
7634 PyErr_NoMemory();
Victor Stinner2fc507f2011-11-04 20:06:39 +01007635 Py_DECREF(substring);
Benjamin Peterson29060642009-01-31 22:14:21 +00007636 return -1;
Victor Stinner3a50e702011-10-18 21:21:00 +02007637 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007638 if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7639 Py_DECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007640 return -1;
Victor Stinner2fc507f2011-11-04 20:06:39 +01007641 }
Victor Stinner3a50e702011-10-18 21:21:00 +02007642 out = PyBytes_AS_STRING(*outbytes) + n;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007643 }
7644
7645 /* Do the conversion */
Victor Stinner3a50e702011-10-18 21:21:00 +02007646 outsize = WideCharToMultiByte(code_page, flags,
Victor Stinner9f067f42013-06-05 00:21:31 +02007647 p, (int)size,
Victor Stinner3a50e702011-10-18 21:21:00 +02007648 out, outsize,
7649 NULL, pusedDefaultChar);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007650 Py_CLEAR(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007651 if (outsize <= 0)
7652 goto error;
7653 if (pusedDefaultChar && *pusedDefaultChar)
7654 return -2;
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007655 return 0;
Victor Stinner554f3f02010-06-16 23:33:54 +00007656
Victor Stinner3a50e702011-10-18 21:21:00 +02007657error:
Victor Stinner2fc507f2011-11-04 20:06:39 +01007658 Py_XDECREF(substring);
Victor Stinner3a50e702011-10-18 21:21:00 +02007659 if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7660 return -2;
7661 PyErr_SetFromWindowsErr(0);
Victor Stinner554f3f02010-06-16 23:33:54 +00007662 return -1;
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007663}
7664
Victor Stinner3a50e702011-10-18 21:21:00 +02007665/*
Serhiy Storchakad65c9492015-11-02 14:10:23 +02007666 * Encode a Unicode string to a Windows code page into a byte string using an
Victor Stinner3a50e702011-10-18 21:21:00 +02007667 * error handler.
7668 *
Andrew Svetlov2606a6f2012-12-19 14:33:35 +02007669 * Returns consumed characters if succeed, or raise an OSError and returns
Victor Stinner3a50e702011-10-18 21:21:00 +02007670 * -1 on other error.
7671 */
7672static int
7673encode_code_page_errors(UINT code_page, PyObject **outbytes,
Victor Stinner7581cef2011-11-03 22:32:33 +01007674 PyObject *unicode, Py_ssize_t unicode_offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007675 Py_ssize_t insize, const char* errors)
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007676{
Victor Stinner3a50e702011-10-18 21:21:00 +02007677 const DWORD flags = encode_code_page_flags(code_page, errors);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007678 Py_ssize_t pos = unicode_offset;
7679 Py_ssize_t endin = unicode_offset + insize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007680 /* Ideally, we should get reason from FormatMessage. This is the Windows
7681 2000 English version of the message. */
7682 const char *reason = "invalid character";
7683 /* 4=maximum length of a UTF-8 sequence */
7684 char buffer[4];
7685 BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7686 Py_ssize_t outsize;
7687 char *out;
Victor Stinner3a50e702011-10-18 21:21:00 +02007688 PyObject *errorHandler = NULL;
7689 PyObject *exc = NULL;
7690 PyObject *encoding_obj = NULL;
Serhiy Storchakaef1585e2015-12-25 20:01:53 +02007691 const char *encoding;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007692 Py_ssize_t newpos, newoutsize;
Victor Stinner3a50e702011-10-18 21:21:00 +02007693 PyObject *rep;
7694 int ret = -1;
7695
7696 assert(insize > 0);
7697
7698 encoding = code_page_name(code_page, &encoding_obj);
7699 if (encoding == NULL)
7700 return -1;
7701
7702 if (errors == NULL || strcmp(errors, "strict") == 0) {
7703 /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7704 then we raise a UnicodeEncodeError. */
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007705 make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
Victor Stinner3a50e702011-10-18 21:21:00 +02007706 if (exc != NULL) {
7707 PyCodec_StrictErrors(exc);
7708 Py_DECREF(exc);
7709 }
7710 Py_XDECREF(encoding_obj);
7711 return -1;
7712 }
7713
7714 if (code_page != CP_UTF8 && code_page != CP_UTF7)
7715 pusedDefaultChar = &usedDefaultChar;
7716 else
7717 pusedDefaultChar = NULL;
7718
7719 if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7720 PyErr_NoMemory();
7721 goto error;
7722 }
7723 outsize = insize * Py_ARRAY_LENGTH(buffer);
7724
7725 if (*outbytes == NULL) {
7726 /* Create string object */
7727 *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7728 if (*outbytes == NULL)
7729 goto error;
7730 out = PyBytes_AS_STRING(*outbytes);
7731 }
7732 else {
7733 /* Extend string object */
7734 Py_ssize_t n = PyBytes_Size(*outbytes);
7735 if (n > PY_SSIZE_T_MAX - outsize) {
7736 PyErr_NoMemory();
7737 goto error;
7738 }
7739 if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7740 goto error;
7741 out = PyBytes_AS_STRING(*outbytes) + n;
7742 }
7743
7744 /* Encode the string character per character */
Martin v. Löwis3d325192011-11-04 18:23:06 +01007745 while (pos < endin)
Victor Stinner3a50e702011-10-18 21:21:00 +02007746 {
Victor Stinner2fc507f2011-11-04 20:06:39 +01007747 Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7748 wchar_t chars[2];
7749 int charsize;
7750 if (ch < 0x10000) {
7751 chars[0] = (wchar_t)ch;
7752 charsize = 1;
7753 }
7754 else {
Victor Stinner76df43d2012-10-30 01:42:39 +01007755 chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7756 chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
Victor Stinner2fc507f2011-11-04 20:06:39 +01007757 charsize = 2;
7758 }
7759
Victor Stinner3a50e702011-10-18 21:21:00 +02007760 outsize = WideCharToMultiByte(code_page, flags,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007761 chars, charsize,
Victor Stinner3a50e702011-10-18 21:21:00 +02007762 buffer, Py_ARRAY_LENGTH(buffer),
7763 NULL, pusedDefaultChar);
7764 if (outsize > 0) {
7765 if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7766 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007767 pos++;
Victor Stinner3a50e702011-10-18 21:21:00 +02007768 memcpy(out, buffer, outsize);
7769 out += outsize;
7770 continue;
7771 }
7772 }
7773 else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7774 PyErr_SetFromWindowsErr(0);
7775 goto error;
7776 }
7777
Victor Stinner3a50e702011-10-18 21:21:00 +02007778 rep = unicode_encode_call_errorhandler(
7779 errors, &errorHandler, encoding, reason,
Victor Stinner7581cef2011-11-03 22:32:33 +01007780 unicode, &exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007781 pos, pos + 1, &newpos);
Victor Stinner3a50e702011-10-18 21:21:00 +02007782 if (rep == NULL)
7783 goto error;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007784 pos = newpos;
Victor Stinner3a50e702011-10-18 21:21:00 +02007785
7786 if (PyBytes_Check(rep)) {
7787 outsize = PyBytes_GET_SIZE(rep);
7788 if (outsize != 1) {
7789 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7790 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7791 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7792 Py_DECREF(rep);
7793 goto error;
7794 }
7795 out = PyBytes_AS_STRING(*outbytes) + offset;
7796 }
7797 memcpy(out, PyBytes_AS_STRING(rep), outsize);
7798 out += outsize;
7799 }
7800 else {
7801 Py_ssize_t i;
7802 enum PyUnicode_Kind kind;
7803 void *data;
7804
Benjamin Petersonbac79492012-01-14 13:34:47 -05007805 if (PyUnicode_READY(rep) == -1) {
Victor Stinner3a50e702011-10-18 21:21:00 +02007806 Py_DECREF(rep);
7807 goto error;
7808 }
7809
7810 outsize = PyUnicode_GET_LENGTH(rep);
7811 if (outsize != 1) {
7812 Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7813 newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7814 if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7815 Py_DECREF(rep);
7816 goto error;
7817 }
7818 out = PyBytes_AS_STRING(*outbytes) + offset;
7819 }
7820 kind = PyUnicode_KIND(rep);
7821 data = PyUnicode_DATA(rep);
7822 for (i=0; i < outsize; i++) {
7823 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7824 if (ch > 127) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01007825 raise_encode_exception(&exc,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007826 encoding, unicode,
7827 pos, pos + 1,
Victor Stinner3a50e702011-10-18 21:21:00 +02007828 "unable to encode error handler result to ASCII");
7829 Py_DECREF(rep);
7830 goto error;
7831 }
7832 *out = (unsigned char)ch;
7833 out++;
7834 }
7835 }
7836 Py_DECREF(rep);
7837 }
7838 /* write a NUL byte */
7839 *out = 0;
7840 outsize = out - PyBytes_AS_STRING(*outbytes);
7841 assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7842 if (_PyBytes_Resize(outbytes, outsize) < 0)
7843 goto error;
7844 ret = 0;
7845
7846error:
7847 Py_XDECREF(encoding_obj);
7848 Py_XDECREF(errorHandler);
7849 Py_XDECREF(exc);
7850 return ret;
7851}
7852
Victor Stinner3a50e702011-10-18 21:21:00 +02007853static PyObject *
7854encode_code_page(int code_page,
Victor Stinner7581cef2011-11-03 22:32:33 +01007855 PyObject *unicode,
Victor Stinner3a50e702011-10-18 21:21:00 +02007856 const char *errors)
7857{
Martin v. Löwis3d325192011-11-04 18:23:06 +01007858 Py_ssize_t len;
Victor Stinner3a50e702011-10-18 21:21:00 +02007859 PyObject *outbytes = NULL;
Victor Stinner7581cef2011-11-03 22:32:33 +01007860 Py_ssize_t offset;
Victor Stinner76a31a62011-11-04 00:05:13 +01007861 int chunk_len, ret, done;
Victor Stinner7581cef2011-11-03 22:32:33 +01007862
Victor Stinner29dacf22015-01-26 16:41:32 +01007863 if (!PyUnicode_Check(unicode)) {
7864 PyErr_BadArgument();
7865 return NULL;
7866 }
7867
Benjamin Petersonbac79492012-01-14 13:34:47 -05007868 if (PyUnicode_READY(unicode) == -1)
Victor Stinner2fc507f2011-11-04 20:06:39 +01007869 return NULL;
7870 len = PyUnicode_GET_LENGTH(unicode);
Guido van Rossum03e29f12000-05-04 15:52:20 +00007871
Victor Stinner3a50e702011-10-18 21:21:00 +02007872 if (code_page < 0) {
7873 PyErr_SetString(PyExc_ValueError, "invalid code page number");
7874 return NULL;
7875 }
7876
Martin v. Löwis3d325192011-11-04 18:23:06 +01007877 if (len == 0)
Victor Stinner76a31a62011-11-04 00:05:13 +01007878 return PyBytes_FromStringAndSize(NULL, 0);
7879
Victor Stinner7581cef2011-11-03 22:32:33 +01007880 offset = 0;
7881 do
7882 {
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007883#ifdef NEED_RETRY
Steve Dower7ebdda02019-08-21 16:22:33 -07007884 if (len > DECODING_CHUNK_SIZE) {
7885 chunk_len = DECODING_CHUNK_SIZE;
Victor Stinner76a31a62011-11-04 00:05:13 +01007886 done = 0;
7887 }
Victor Stinner7581cef2011-11-03 22:32:33 +01007888 else
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007889#endif
Victor Stinner76a31a62011-11-04 00:05:13 +01007890 {
Martin v. Löwis3d325192011-11-04 18:23:06 +01007891 chunk_len = (int)len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007892 done = 1;
7893 }
Victor Stinner2fc507f2011-11-04 20:06:39 +01007894
Victor Stinner76a31a62011-11-04 00:05:13 +01007895 ret = encode_code_page_strict(code_page, &outbytes,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007896 unicode, offset, chunk_len,
Victor Stinner76a31a62011-11-04 00:05:13 +01007897 errors);
7898 if (ret == -2)
7899 ret = encode_code_page_errors(code_page, &outbytes,
7900 unicode, offset,
Martin v. Löwis3d325192011-11-04 18:23:06 +01007901 chunk_len, errors);
Victor Stinner7581cef2011-11-03 22:32:33 +01007902 if (ret < 0) {
7903 Py_XDECREF(outbytes);
7904 return NULL;
7905 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007906
Victor Stinner7581cef2011-11-03 22:32:33 +01007907 offset += chunk_len;
Martin v. Löwis3d325192011-11-04 18:23:06 +01007908 len -= chunk_len;
Victor Stinner76a31a62011-11-04 00:05:13 +01007909 } while (!done);
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007910
Victor Stinner3a50e702011-10-18 21:21:00 +02007911 return outbytes;
7912}
7913
7914PyObject *
7915PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7916 Py_ssize_t size,
7917 const char *errors)
7918{
Victor Stinner7581cef2011-11-03 22:32:33 +01007919 PyObject *unicode, *res;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02007920 unicode = PyUnicode_FromWideChar(p, size);
Victor Stinner7581cef2011-11-03 22:32:33 +01007921 if (unicode == NULL)
7922 return NULL;
7923 res = encode_code_page(CP_ACP, unicode, errors);
7924 Py_DECREF(unicode);
7925 return res;
Victor Stinner3a50e702011-10-18 21:21:00 +02007926}
7927
7928PyObject *
7929PyUnicode_EncodeCodePage(int code_page,
7930 PyObject *unicode,
7931 const char *errors)
7932{
Victor Stinner7581cef2011-11-03 22:32:33 +01007933 return encode_code_page(code_page, unicode, errors);
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007934}
Guido van Rossum2ea3e142000-03-31 17:24:09 +00007935
Alexander Belopolsky40018472011-02-26 01:02:56 +00007936PyObject *
7937PyUnicode_AsMBCSString(PyObject *unicode)
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007938{
Victor Stinner7581cef2011-11-03 22:32:33 +01007939 return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
Mark Hammond0ccda1e2003-07-01 00:13:27 +00007940}
7941
Thomas Wouters0e3f5912006-08-11 14:57:12 +00007942#undef NEED_RETRY
7943
Steve Dowercc16be82016-09-08 10:35:16 -07007944#endif /* MS_WINDOWS */
Guido van Rossumb7a40ba2000-03-28 02:01:52 +00007945
Guido van Rossumd57fd912000-03-10 22:53:23 +00007946/* --- Character Mapping Codec -------------------------------------------- */
7947
Victor Stinnerfb161b12013-04-18 01:44:27 +02007948static int
7949charmap_decode_string(const char *s,
7950 Py_ssize_t size,
7951 PyObject *mapping,
7952 const char *errors,
7953 _PyUnicodeWriter *writer)
7954{
7955 const char *starts = s;
7956 const char *e;
7957 Py_ssize_t startinpos, endinpos;
7958 PyObject *errorHandler = NULL, *exc = NULL;
7959 Py_ssize_t maplen;
7960 enum PyUnicode_Kind mapkind;
7961 void *mapdata;
7962 Py_UCS4 x;
7963 unsigned char ch;
7964
7965 if (PyUnicode_READY(mapping) == -1)
7966 return -1;
7967
7968 maplen = PyUnicode_GET_LENGTH(mapping);
7969 mapdata = PyUnicode_DATA(mapping);
7970 mapkind = PyUnicode_KIND(mapping);
7971
7972 e = s + size;
7973
7974 if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7975 /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7976 * is disabled in encoding aliases, latin1 is preferred because
7977 * its implementation is faster. */
7978 Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7979 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7980 Py_UCS4 maxchar = writer->maxchar;
7981
7982 assert (writer->kind == PyUnicode_1BYTE_KIND);
7983 while (s < e) {
7984 ch = *s;
7985 x = mapdata_ucs1[ch];
7986 if (x > maxchar) {
7987 if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7988 goto onError;
7989 maxchar = writer->maxchar;
7990 outdata = (Py_UCS1 *)writer->data;
7991 }
7992 outdata[writer->pos] = x;
7993 writer->pos++;
7994 ++s;
7995 }
7996 return 0;
7997 }
7998
7999 while (s < e) {
8000 if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
8001 enum PyUnicode_Kind outkind = writer->kind;
8002 Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
8003 if (outkind == PyUnicode_1BYTE_KIND) {
8004 Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
8005 Py_UCS4 maxchar = writer->maxchar;
8006 while (s < e) {
8007 ch = *s;
8008 x = mapdata_ucs2[ch];
8009 if (x > maxchar)
8010 goto Error;
8011 outdata[writer->pos] = x;
8012 writer->pos++;
8013 ++s;
8014 }
8015 break;
8016 }
8017 else if (outkind == PyUnicode_2BYTE_KIND) {
8018 Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
8019 while (s < e) {
8020 ch = *s;
8021 x = mapdata_ucs2[ch];
8022 if (x == 0xFFFE)
8023 goto Error;
8024 outdata[writer->pos] = x;
8025 writer->pos++;
8026 ++s;
8027 }
8028 break;
8029 }
8030 }
8031 ch = *s;
8032
8033 if (ch < maplen)
8034 x = PyUnicode_READ(mapkind, mapdata, ch);
8035 else
8036 x = 0xfffe; /* invalid value */
8037Error:
8038 if (x == 0xfffe)
8039 {
8040 /* undefined mapping */
8041 startinpos = s-starts;
8042 endinpos = startinpos+1;
8043 if (unicode_decode_call_errorhandler_writer(
8044 errors, &errorHandler,
8045 "charmap", "character maps to <undefined>",
8046 &starts, &e, &startinpos, &endinpos, &exc, &s,
8047 writer)) {
8048 goto onError;
8049 }
8050 continue;
8051 }
8052
8053 if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
8054 goto onError;
8055 ++s;
8056 }
8057 Py_XDECREF(errorHandler);
8058 Py_XDECREF(exc);
8059 return 0;
8060
8061onError:
8062 Py_XDECREF(errorHandler);
8063 Py_XDECREF(exc);
8064 return -1;
8065}
8066
8067static int
8068charmap_decode_mapping(const char *s,
8069 Py_ssize_t size,
8070 PyObject *mapping,
8071 const char *errors,
8072 _PyUnicodeWriter *writer)
8073{
8074 const char *starts = s;
8075 const char *e;
8076 Py_ssize_t startinpos, endinpos;
8077 PyObject *errorHandler = NULL, *exc = NULL;
8078 unsigned char ch;
Victor Stinnerf4f24242013-05-07 01:01:31 +02008079 PyObject *key, *item = NULL;
Victor Stinnerfb161b12013-04-18 01:44:27 +02008080
8081 e = s + size;
8082
8083 while (s < e) {
8084 ch = *s;
8085
8086 /* Get mapping (char ordinal -> integer, Unicode char or None) */
8087 key = PyLong_FromLong((long)ch);
8088 if (key == NULL)
8089 goto onError;
8090
8091 item = PyObject_GetItem(mapping, key);
8092 Py_DECREF(key);
8093 if (item == NULL) {
8094 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8095 /* No mapping found means: mapping is undefined. */
8096 PyErr_Clear();
8097 goto Undefined;
8098 } else
8099 goto onError;
8100 }
8101
8102 /* Apply mapping */
8103 if (item == Py_None)
8104 goto Undefined;
8105 if (PyLong_Check(item)) {
8106 long value = PyLong_AS_LONG(item);
8107 if (value == 0xFFFE)
8108 goto Undefined;
8109 if (value < 0 || value > MAX_UNICODE) {
8110 PyErr_Format(PyExc_TypeError,
8111 "character mapping must be in range(0x%lx)",
8112 (unsigned long)MAX_UNICODE + 1);
8113 goto onError;
8114 }
8115
8116 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8117 goto onError;
8118 }
8119 else if (PyUnicode_Check(item)) {
8120 if (PyUnicode_READY(item) == -1)
8121 goto onError;
8122 if (PyUnicode_GET_LENGTH(item) == 1) {
8123 Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
8124 if (value == 0xFFFE)
8125 goto Undefined;
8126 if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
8127 goto onError;
8128 }
8129 else {
8130 writer->overallocate = 1;
8131 if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
8132 goto onError;
8133 }
8134 }
8135 else {
8136 /* wrong return value */
8137 PyErr_SetString(PyExc_TypeError,
8138 "character mapping must return integer, None or str");
8139 goto onError;
8140 }
8141 Py_CLEAR(item);
8142 ++s;
8143 continue;
8144
8145Undefined:
8146 /* undefined mapping */
8147 Py_CLEAR(item);
8148 startinpos = s-starts;
8149 endinpos = startinpos+1;
8150 if (unicode_decode_call_errorhandler_writer(
8151 errors, &errorHandler,
8152 "charmap", "character maps to <undefined>",
8153 &starts, &e, &startinpos, &endinpos, &exc, &s,
8154 writer)) {
8155 goto onError;
8156 }
8157 }
8158 Py_XDECREF(errorHandler);
8159 Py_XDECREF(exc);
8160 return 0;
8161
8162onError:
8163 Py_XDECREF(item);
8164 Py_XDECREF(errorHandler);
8165 Py_XDECREF(exc);
8166 return -1;
8167}
8168
Alexander Belopolsky40018472011-02-26 01:02:56 +00008169PyObject *
8170PyUnicode_DecodeCharmap(const char *s,
8171 Py_ssize_t size,
8172 PyObject *mapping,
8173 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008174{
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008175 _PyUnicodeWriter writer;
Tim Petersced69f82003-09-16 20:30:58 +00008176
Guido van Rossumd57fd912000-03-10 22:53:23 +00008177 /* Default to Latin-1 */
8178 if (mapping == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008179 return PyUnicode_DecodeLatin1(s, size, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008180
Guido van Rossumd57fd912000-03-10 22:53:23 +00008181 if (size == 0)
Serhiy Storchakaed3c4122013-01-26 12:18:17 +02008182 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner8f674cc2013-04-17 23:02:17 +02008183 _PyUnicodeWriter_Init(&writer);
Victor Stinner170ca6f2013-04-18 00:25:28 +02008184 writer.min_length = size;
8185 if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008186 goto onError;
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008187
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008188 if (PyUnicode_CheckExact(mapping)) {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008189 if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
8190 goto onError;
Walter Dörwaldd1c1e102005-10-06 20:29:57 +00008191 }
8192 else {
Victor Stinnerfb161b12013-04-18 01:44:27 +02008193 if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
8194 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008195 }
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008196 return _PyUnicodeWriter_Finish(&writer);
Tim Petersced69f82003-09-16 20:30:58 +00008197
Benjamin Peterson29060642009-01-31 22:14:21 +00008198 onError:
Victor Stinnerfc009ef2012-11-07 00:36:38 +01008199 _PyUnicodeWriter_Dealloc(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008200 return NULL;
8201}
8202
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008203/* Charmap encoding: the lookup table */
8204
Alexander Belopolsky40018472011-02-26 01:02:56 +00008205struct encoding_map {
Benjamin Peterson29060642009-01-31 22:14:21 +00008206 PyObject_HEAD
8207 unsigned char level1[32];
8208 int count2, count3;
8209 unsigned char level23[1];
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008210};
8211
8212static PyObject*
8213encoding_map_size(PyObject *obj, PyObject* args)
8214{
8215 struct encoding_map *map = (struct encoding_map*)obj;
Benjamin Peterson14339b62009-01-31 16:36:08 +00008216 return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
Benjamin Peterson29060642009-01-31 22:14:21 +00008217 128*map->count3);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008218}
8219
8220static PyMethodDef encoding_map_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008221 {"size", encoding_map_size, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +00008222 PyDoc_STR("Return the size (in bytes) of this object") },
8223 { 0 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008224};
8225
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008226static PyTypeObject EncodingMapType = {
Benjamin Peterson14339b62009-01-31 16:36:08 +00008227 PyVarObject_HEAD_INIT(NULL, 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008228 "EncodingMap", /*tp_name*/
8229 sizeof(struct encoding_map), /*tp_basicsize*/
8230 0, /*tp_itemsize*/
8231 /* methods */
Inada Naoki7d408692019-05-29 17:23:27 +09008232 0, /*tp_dealloc*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008233 0, /*tp_vectorcall_offset*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008234 0, /*tp_getattr*/
8235 0, /*tp_setattr*/
Jeroen Demeyer530f5062019-05-31 04:13:39 +02008236 0, /*tp_as_async*/
Benjamin Peterson29060642009-01-31 22:14:21 +00008237 0, /*tp_repr*/
8238 0, /*tp_as_number*/
8239 0, /*tp_as_sequence*/
8240 0, /*tp_as_mapping*/
8241 0, /*tp_hash*/
8242 0, /*tp_call*/
8243 0, /*tp_str*/
8244 0, /*tp_getattro*/
8245 0, /*tp_setattro*/
8246 0, /*tp_as_buffer*/
8247 Py_TPFLAGS_DEFAULT, /*tp_flags*/
8248 0, /*tp_doc*/
8249 0, /*tp_traverse*/
8250 0, /*tp_clear*/
8251 0, /*tp_richcompare*/
8252 0, /*tp_weaklistoffset*/
8253 0, /*tp_iter*/
8254 0, /*tp_iternext*/
8255 encoding_map_methods, /*tp_methods*/
8256 0, /*tp_members*/
8257 0, /*tp_getset*/
8258 0, /*tp_base*/
8259 0, /*tp_dict*/
8260 0, /*tp_descr_get*/
8261 0, /*tp_descr_set*/
8262 0, /*tp_dictoffset*/
8263 0, /*tp_init*/
8264 0, /*tp_alloc*/
8265 0, /*tp_new*/
8266 0, /*tp_free*/
8267 0, /*tp_is_gc*/
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008268};
8269
8270PyObject*
8271PyUnicode_BuildEncodingMap(PyObject* string)
8272{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008273 PyObject *result;
8274 struct encoding_map *mresult;
8275 int i;
8276 int need_dict = 0;
8277 unsigned char level1[32];
8278 unsigned char level2[512];
8279 unsigned char *mlevel1, *mlevel2, *mlevel3;
8280 int count2 = 0, count3 = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008281 int kind;
8282 void *data;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008283 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008284 Py_UCS4 ch;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008285
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008286 if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008287 PyErr_BadArgument();
8288 return NULL;
8289 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008290 kind = PyUnicode_KIND(string);
8291 data = PyUnicode_DATA(string);
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008292 length = PyUnicode_GET_LENGTH(string);
8293 length = Py_MIN(length, 256);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008294 memset(level1, 0xFF, sizeof level1);
8295 memset(level2, 0xFF, sizeof level2);
8296
8297 /* If there isn't a one-to-one mapping of NULL to \0,
8298 or if there are non-BMP characters, we need to use
8299 a mapping dictionary. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008300 if (PyUnicode_READ(kind, data, 0) != 0)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008301 need_dict = 1;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008302 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008303 int l1, l2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008304 ch = PyUnicode_READ(kind, data, i);
8305 if (ch == 0 || ch > 0xFFFF) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008306 need_dict = 1;
8307 break;
8308 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008309 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008310 /* unmapped character */
8311 continue;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008312 l1 = ch >> 11;
8313 l2 = ch >> 7;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008314 if (level1[l1] == 0xFF)
8315 level1[l1] = count2++;
8316 if (level2[l2] == 0xFF)
Benjamin Peterson14339b62009-01-31 16:36:08 +00008317 level2[l2] = count3++;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008318 }
8319
8320 if (count2 >= 0xFF || count3 >= 0xFF)
8321 need_dict = 1;
8322
8323 if (need_dict) {
8324 PyObject *result = PyDict_New();
8325 PyObject *key, *value;
8326 if (!result)
8327 return NULL;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008328 for (i = 0; i < length; i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008329 key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
Christian Heimes217cfd12007-12-02 14:31:20 +00008330 value = PyLong_FromLong(i);
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008331 if (!key || !value)
8332 goto failed1;
8333 if (PyDict_SetItem(result, key, value) == -1)
8334 goto failed1;
8335 Py_DECREF(key);
8336 Py_DECREF(value);
8337 }
8338 return result;
8339 failed1:
8340 Py_XDECREF(key);
8341 Py_XDECREF(value);
8342 Py_DECREF(result);
8343 return NULL;
8344 }
8345
8346 /* Create a three-level trie */
8347 result = PyObject_MALLOC(sizeof(struct encoding_map) +
8348 16*count2 + 128*count3 - 1);
8349 if (!result)
8350 return PyErr_NoMemory();
8351 PyObject_Init(result, &EncodingMapType);
8352 mresult = (struct encoding_map*)result;
8353 mresult->count2 = count2;
8354 mresult->count3 = count3;
8355 mlevel1 = mresult->level1;
8356 mlevel2 = mresult->level23;
8357 mlevel3 = mresult->level23 + 16*count2;
8358 memcpy(mlevel1, level1, 32);
8359 memset(mlevel2, 0xFF, 16*count2);
8360 memset(mlevel3, 0, 128*count3);
8361 count3 = 0;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008362 for (i = 1; i < length; i++) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008363 int o1, o2, o3, i2, i3;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008364 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8365 if (ch == 0xFFFE)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008366 /* unmapped character */
8367 continue;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008368 o1 = ch>>11;
8369 o2 = (ch>>7) & 0xF;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008370 i2 = 16*mlevel1[o1] + o2;
8371 if (mlevel2[i2] == 0xFF)
8372 mlevel2[i2] = count3++;
Antoine Pitrouaaefac72012-06-16 22:48:21 +02008373 o3 = ch & 0x7F;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008374 i3 = 128*mlevel2[i2] + o3;
8375 mlevel3[i3] = i;
8376 }
8377 return result;
8378}
8379
8380static int
Victor Stinner22168992011-11-20 17:09:18 +01008381encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008382{
8383 struct encoding_map *map = (struct encoding_map*)mapping;
8384 int l1 = c>>11;
8385 int l2 = (c>>7) & 0xF;
8386 int l3 = c & 0x7F;
8387 int i;
8388
Victor Stinner22168992011-11-20 17:09:18 +01008389 if (c > 0xFFFF)
Benjamin Peterson29060642009-01-31 22:14:21 +00008390 return -1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008391 if (c == 0)
8392 return 0;
8393 /* level 1*/
8394 i = map->level1[l1];
8395 if (i == 0xFF) {
8396 return -1;
8397 }
8398 /* level 2*/
8399 i = map->level23[16*i+l2];
8400 if (i == 0xFF) {
8401 return -1;
8402 }
8403 /* level 3 */
8404 i = map->level23[16*map->count2 + 128*i + l3];
8405 if (i == 0) {
8406 return -1;
8407 }
8408 return i;
8409}
8410
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008411/* Lookup the character ch in the mapping. If the character
8412 can't be found, Py_None is returned (or NULL, if another
Fred Drakedb390c12005-10-28 14:39:47 +00008413 error occurred). */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008414static PyObject *
Victor Stinner22168992011-11-20 17:09:18 +01008415charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008416{
Christian Heimes217cfd12007-12-02 14:31:20 +00008417 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008418 PyObject *x;
8419
8420 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008421 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008422 x = PyObject_GetItem(mapping, w);
8423 Py_DECREF(w);
8424 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008425 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8426 /* No mapping found means: mapping is undefined. */
8427 PyErr_Clear();
Serhiy Storchaka228b12e2017-01-23 09:47:21 +02008428 Py_RETURN_NONE;
Benjamin Peterson29060642009-01-31 22:14:21 +00008429 } else
8430 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008431 }
Walter Dörwaldadc72742003-01-08 22:01:33 +00008432 else if (x == Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00008433 return x;
Christian Heimes217cfd12007-12-02 14:31:20 +00008434 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008435 long value = PyLong_AS_LONG(x);
8436 if (value < 0 || value > 255) {
8437 PyErr_SetString(PyExc_TypeError,
8438 "character mapping must be in range(256)");
8439 Py_DECREF(x);
8440 return NULL;
8441 }
8442 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008443 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008444 else if (PyBytes_Check(x))
Benjamin Peterson29060642009-01-31 22:14:21 +00008445 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008446 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008447 /* wrong return value */
8448 PyErr_Format(PyExc_TypeError,
8449 "character mapping must return integer, bytes or None, not %.400s",
Victor Stinner58ac7002020-02-07 03:04:21 +01008450 Py_TYPE(x)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +00008451 Py_DECREF(x);
8452 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008453 }
8454}
8455
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008456static int
Guido van Rossum98297ee2007-11-06 21:34:58 +00008457charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008458{
Benjamin Peterson14339b62009-01-31 16:36:08 +00008459 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8460 /* exponentially overallocate to minimize reallocations */
8461 if (requiredsize < 2*outsize)
8462 requiredsize = 2*outsize;
8463 if (_PyBytes_Resize(outobj, requiredsize))
8464 return -1;
8465 return 0;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008466}
8467
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the replacement byte(s) were written */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008471/* lookup the character, put the result in the output string and adjust
Walter Dörwald827b0552007-05-12 13:23:53 +00008472 various state variables. Resize the output bytes object if not enough
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008473 space is available. Return a new reference to the object that
8474 was put in the output buffer, or Py_None, if the mapping was undefined
8475 (in which case no character was written) or NULL, if a
Andrew M. Kuchling8294de52005-11-02 16:36:12 +00008476 reallocation error occurred. The caller must decref the result */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008477static charmapencode_result
Victor Stinner22168992011-11-20 17:09:18 +01008478charmapencode_output(Py_UCS4 c, PyObject *mapping,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008479 PyObject **outobj, Py_ssize_t *outpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008480{
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008481 PyObject *rep;
8482 char *outstart;
Christian Heimes72b710a2008-05-26 13:28:38 +00008483 Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008484
Andy Lesterdffe4c02020-03-04 07:15:20 -06008485 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008486 int res = encoding_map_lookup(c, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008487 Py_ssize_t requiredsize = *outpos+1;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008488 if (res == -1)
8489 return enc_FAILED;
Benjamin Peterson29060642009-01-31 22:14:21 +00008490 if (outsize<requiredsize)
8491 if (charmapencode_resize(outobj, outpos, requiredsize))
8492 return enc_EXCEPTION;
Christian Heimes72b710a2008-05-26 13:28:38 +00008493 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008494 outstart[(*outpos)++] = (char)res;
8495 return enc_SUCCESS;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008496 }
8497
8498 rep = charmapencode_lookup(c, mapping);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008499 if (rep==NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008500 return enc_EXCEPTION;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008501 else if (rep==Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008502 Py_DECREF(rep);
8503 return enc_FAILED;
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008504 } else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008505 if (PyLong_Check(rep)) {
8506 Py_ssize_t requiredsize = *outpos+1;
8507 if (outsize<requiredsize)
8508 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8509 Py_DECREF(rep);
8510 return enc_EXCEPTION;
8511 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008512 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008513 outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008514 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008515 else {
8516 const char *repchars = PyBytes_AS_STRING(rep);
8517 Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8518 Py_ssize_t requiredsize = *outpos+repsize;
8519 if (outsize<requiredsize)
8520 if (charmapencode_resize(outobj, outpos, requiredsize)) {
8521 Py_DECREF(rep);
8522 return enc_EXCEPTION;
8523 }
Christian Heimes72b710a2008-05-26 13:28:38 +00008524 outstart = PyBytes_AS_STRING(*outobj);
Benjamin Peterson29060642009-01-31 22:14:21 +00008525 memcpy(outstart + *outpos, repchars, repsize);
8526 *outpos += repsize;
8527 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008528 }
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008529 Py_DECREF(rep);
8530 return enc_SUCCESS;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008531}
8532
8533/* handle an error in PyUnicode_EncodeCharmap
8534 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008535static int
8536charmap_encoding_error(
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008537 PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008538 PyObject **exceptionObject,
Victor Stinner50149202015-09-22 00:26:54 +02008539 _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
Guido van Rossum98297ee2007-11-06 21:34:58 +00008540 PyObject **res, Py_ssize_t *respos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008541{
8542 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008543 Py_ssize_t size, repsize;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008544 Py_ssize_t newpos;
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008545 enum PyUnicode_Kind kind;
8546 void *data;
8547 Py_ssize_t index;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008548 /* startpos for collecting unencodable chars */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008549 Py_ssize_t collstartpos = *inpos;
8550 Py_ssize_t collendpos = *inpos+1;
8551 Py_ssize_t collpos;
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02008552 const char *encoding = "charmap";
8553 const char *reason = "character maps to <undefined>";
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008554 charmapencode_result x;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008555 Py_UCS4 ch;
Brian Curtin2787ea42011-11-02 15:09:37 -05008556 int val;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008557
Benjamin Petersonbac79492012-01-14 13:34:47 -05008558 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008559 return -1;
8560 size = PyUnicode_GET_LENGTH(unicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008561 /* find all unencodable characters */
8562 while (collendpos < size) {
Thomas Wouters73e5a5b2006-06-08 15:35:45 +00008563 PyObject *rep;
Andy Lesterdffe4c02020-03-04 07:15:20 -06008564 if (Py_IS_TYPE(mapping, &EncodingMapType)) {
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008565 ch = PyUnicode_READ_CHAR(unicode, collendpos);
Brian Curtin2787ea42011-11-02 15:09:37 -05008566 val = encoding_map_lookup(ch, mapping);
8567 if (val != -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00008568 break;
8569 ++collendpos;
8570 continue;
8571 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008572
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008573 ch = PyUnicode_READ_CHAR(unicode, collendpos);
8574 rep = charmapencode_lookup(ch, mapping);
Benjamin Peterson29060642009-01-31 22:14:21 +00008575 if (rep==NULL)
8576 return -1;
8577 else if (rep!=Py_None) {
8578 Py_DECREF(rep);
8579 break;
8580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008581 Py_DECREF(rep);
Benjamin Peterson29060642009-01-31 22:14:21 +00008582 ++collendpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008583 }
8584 /* cache callback name lookup
8585 * (if not done yet, i.e. it's the first error) */
Victor Stinner50149202015-09-22 00:26:54 +02008586 if (*error_handler == _Py_ERROR_UNKNOWN)
Victor Stinner3d4226a2018-08-29 22:21:32 +02008587 *error_handler = _Py_GetErrorHandler(errors);
Victor Stinner50149202015-09-22 00:26:54 +02008588
8589 switch (*error_handler) {
8590 case _Py_ERROR_STRICT:
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008591 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008592 return -1;
Victor Stinner50149202015-09-22 00:26:54 +02008593
8594 case _Py_ERROR_REPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008595 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008596 x = charmapencode_output('?', mapping, res, respos);
8597 if (x==enc_EXCEPTION) {
8598 return -1;
8599 }
8600 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008601 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008602 return -1;
8603 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008604 }
8605 /* fall through */
Victor Stinner50149202015-09-22 00:26:54 +02008606 case _Py_ERROR_IGNORE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008607 *inpos = collendpos;
8608 break;
Victor Stinner50149202015-09-22 00:26:54 +02008609
8610 case _Py_ERROR_XMLCHARREFREPLACE:
Benjamin Peterson14339b62009-01-31 16:36:08 +00008611 /* generate replacement (temporarily (mis)uses p) */
8612 for (collpos = collstartpos; collpos < collendpos; ++collpos) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008613 char buffer[2+29+1+1];
8614 char *cp;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008615 sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
Benjamin Peterson29060642009-01-31 22:14:21 +00008616 for (cp = buffer; *cp; ++cp) {
8617 x = charmapencode_output(*cp, mapping, res, respos);
8618 if (x==enc_EXCEPTION)
8619 return -1;
8620 else if (x==enc_FAILED) {
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008621 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008622 return -1;
8623 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008624 }
8625 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008626 *inpos = collendpos;
8627 break;
Victor Stinner50149202015-09-22 00:26:54 +02008628
Benjamin Peterson14339b62009-01-31 16:36:08 +00008629 default:
Victor Stinner50149202015-09-22 00:26:54 +02008630 repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008631 encoding, reason, unicode, exceptionObject,
Benjamin Peterson29060642009-01-31 22:14:21 +00008632 collstartpos, collendpos, &newpos);
Benjamin Peterson14339b62009-01-31 16:36:08 +00008633 if (repunicode == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008634 return -1;
Martin v. Löwis011e8422009-05-05 04:43:17 +00008635 if (PyBytes_Check(repunicode)) {
8636 /* Directly copy bytes result to output. */
8637 Py_ssize_t outsize = PyBytes_Size(*res);
8638 Py_ssize_t requiredsize;
8639 repsize = PyBytes_Size(repunicode);
8640 requiredsize = *respos + repsize;
8641 if (requiredsize > outsize)
8642 /* Make room for all additional bytes. */
8643 if (charmapencode_resize(res, respos, requiredsize)) {
8644 Py_DECREF(repunicode);
8645 return -1;
8646 }
8647 memcpy(PyBytes_AsString(*res) + *respos,
8648 PyBytes_AsString(repunicode), repsize);
8649 *respos += repsize;
8650 *inpos = newpos;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008651 Py_DECREF(repunicode);
Martin v. Löwis011e8422009-05-05 04:43:17 +00008652 break;
Martin v. Löwisdb12d452009-05-02 18:52:14 +00008653 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008654 /* generate replacement */
Benjamin Petersonbac79492012-01-14 13:34:47 -05008655 if (PyUnicode_READY(repunicode) == -1) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008656 Py_DECREF(repunicode);
8657 return -1;
8658 }
Victor Stinner9e30aa52011-11-21 02:49:52 +01008659 repsize = PyUnicode_GET_LENGTH(repunicode);
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008660 data = PyUnicode_DATA(repunicode);
8661 kind = PyUnicode_KIND(repunicode);
8662 for (index = 0; index < repsize; index++) {
8663 Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8664 x = charmapencode_output(repch, mapping, res, respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008665 if (x==enc_EXCEPTION) {
Victor Stinnerae4f7c82011-11-20 18:28:55 +01008666 Py_DECREF(repunicode);
Benjamin Peterson29060642009-01-31 22:14:21 +00008667 return -1;
8668 }
8669 else if (x==enc_FAILED) {
8670 Py_DECREF(repunicode);
Martin v. Löwis12be46c2011-11-04 19:04:15 +01008671 raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
Benjamin Peterson29060642009-01-31 22:14:21 +00008672 return -1;
8673 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008674 }
8675 *inpos = newpos;
8676 Py_DECREF(repunicode);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008677 }
8678 return 0;
8679}
8680
Alexander Belopolsky40018472011-02-26 01:02:56 +00008681PyObject *
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008682_PyUnicode_EncodeCharmap(PyObject *unicode,
8683 PyObject *mapping,
8684 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008685{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008686 /* output object */
8687 PyObject *res = NULL;
8688 /* current input position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008689 Py_ssize_t inpos = 0;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008690 Py_ssize_t size;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008691 /* current output position */
Martin v. Löwis18e16552006-02-15 17:27:45 +00008692 Py_ssize_t respos = 0;
Victor Stinner50149202015-09-22 00:26:54 +02008693 PyObject *error_handler_obj = NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008694 PyObject *exc = NULL;
Victor Stinner50149202015-09-22 00:26:54 +02008695 _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
Victor Stinner69ed0f42013-04-09 21:48:24 +02008696 void *data;
8697 int kind;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008698
Benjamin Petersonbac79492012-01-14 13:34:47 -05008699 if (PyUnicode_READY(unicode) == -1)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008700 return NULL;
8701 size = PyUnicode_GET_LENGTH(unicode);
Victor Stinner69ed0f42013-04-09 21:48:24 +02008702 data = PyUnicode_DATA(unicode);
8703 kind = PyUnicode_KIND(unicode);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008704
Guido van Rossumd57fd912000-03-10 22:53:23 +00008705 /* Default to Latin-1 */
8706 if (mapping == NULL)
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008707 return unicode_encode_ucs1(unicode, errors, 256);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008708
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008709 /* allocate enough for a simple encoding without
8710 replacements, if we need more, we'll resize */
Christian Heimes72b710a2008-05-26 13:28:38 +00008711 res = PyBytes_FromStringAndSize(NULL, size);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008712 if (res == NULL)
8713 goto onError;
Marc-André Lemburgb7520772000-08-14 11:29:19 +00008714 if (size == 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00008715 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008716
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008717 while (inpos<size) {
Victor Stinner69ed0f42013-04-09 21:48:24 +02008718 Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008719 /* try to encode it */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008720 charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008721 if (x==enc_EXCEPTION) /* error */
8722 goto onError;
8723 if (x==enc_FAILED) { /* unencodable character */
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008724 if (charmap_encoding_error(unicode, &inpos, mapping,
Benjamin Peterson29060642009-01-31 22:14:21 +00008725 &exc,
Victor Stinner50149202015-09-22 00:26:54 +02008726 &error_handler, &error_handler_obj, errors,
Benjamin Peterson29060642009-01-31 22:14:21 +00008727 &res, &respos)) {
8728 goto onError;
8729 }
Benjamin Peterson14339b62009-01-31 16:36:08 +00008730 }
Benjamin Peterson29060642009-01-31 22:14:21 +00008731 else
8732 /* done with this character => adjust input position */
8733 ++inpos;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008734 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00008735
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008736 /* Resize if we allocated to much */
Christian Heimes72b710a2008-05-26 13:28:38 +00008737 if (respos<PyBytes_GET_SIZE(res))
Alexandre Vassalotti44531cb2008-12-27 09:16:49 +00008738 if (_PyBytes_Resize(&res, respos) < 0)
8739 goto onError;
Guido van Rossum98297ee2007-11-06 21:34:58 +00008740
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008741 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008742 Py_XDECREF(error_handler_obj);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008743 return res;
8744
Benjamin Peterson29060642009-01-31 22:14:21 +00008745 onError:
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008746 Py_XDECREF(res);
8747 Py_XDECREF(exc);
Victor Stinner50149202015-09-22 00:26:54 +02008748 Py_XDECREF(error_handler_obj);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008749 return NULL;
8750}
8751
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008752/* Deprecated */
8753PyObject *
8754PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8755 Py_ssize_t size,
8756 PyObject *mapping,
8757 const char *errors)
8758{
8759 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02008760 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008761 if (unicode == NULL)
8762 return NULL;
8763 result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8764 Py_DECREF(unicode);
Victor Stinnerfc026c92011-11-04 00:24:51 +01008765 return result;
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008766}
8767
Alexander Belopolsky40018472011-02-26 01:02:56 +00008768PyObject *
8769PyUnicode_AsCharmapString(PyObject *unicode,
8770 PyObject *mapping)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008771{
8772 if (!PyUnicode_Check(unicode) || mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008773 PyErr_BadArgument();
8774 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00008775 }
Martin v. Löwis23e275b2011-11-02 18:02:51 +01008776 return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008777}
8778
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008779/* create or adjust a UnicodeTranslateError */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008780static void
8781make_translate_exception(PyObject **exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008782 PyObject *unicode,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008783 Py_ssize_t startpos, Py_ssize_t endpos,
8784 const char *reason)
Guido van Rossumd57fd912000-03-10 22:53:23 +00008785{
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008786 if (*exceptionObject == NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008787 *exceptionObject = _PyUnicodeTranslateError_Create(
8788 unicode, startpos, endpos, reason);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008789 }
8790 else {
Benjamin Peterson29060642009-01-31 22:14:21 +00008791 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8792 goto onError;
8793 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8794 goto onError;
8795 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8796 goto onError;
8797 return;
8798 onError:
Serhiy Storchaka505ff752014-02-09 13:33:53 +02008799 Py_CLEAR(*exceptionObject);
Guido van Rossumd57fd912000-03-10 22:53:23 +00008800 }
8801}
8802
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008803/* error handling callback helper:
8804 build arguments, call the callback and check the arguments,
8805 put the result into newpos and return the replacement string, which
8806 has to be freed by the caller */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008807static PyObject *
8808unicode_translate_call_errorhandler(const char *errors,
8809 PyObject **errorHandler,
8810 const char *reason,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008811 PyObject *unicode, PyObject **exceptionObject,
Alexander Belopolsky40018472011-02-26 01:02:56 +00008812 Py_ssize_t startpos, Py_ssize_t endpos,
8813 Py_ssize_t *newpos)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008814{
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008815 static const char *argparse = "Un;translating error handler must return (str, int) tuple";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008816
Thomas Wouters49fd7fa2006-04-21 10:40:58 +00008817 Py_ssize_t i_newpos;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008818 PyObject *restuple;
8819 PyObject *resunicode;
8820
8821 if (*errorHandler == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008822 *errorHandler = PyCodec_LookupError(errors);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008823 if (*errorHandler == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008824 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008825 }
8826
8827 make_translate_exception(exceptionObject,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008828 unicode, startpos, endpos, reason);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008829 if (*exceptionObject == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008830 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008831
Petr Viktorinffd97532020-02-11 17:46:57 +01008832 restuple = PyObject_CallOneArg(*errorHandler, *exceptionObject);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008833 if (restuple == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008834 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008835 if (!PyTuple_Check(restuple)) {
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008836 PyErr_SetString(PyExc_TypeError, &argparse[3]);
Benjamin Peterson29060642009-01-31 22:14:21 +00008837 Py_DECREF(restuple);
8838 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008839 }
Serhiy Storchakaf8d7d412016-10-23 15:12:25 +03008840 if (!PyArg_ParseTuple(restuple, argparse,
Benjamin Peterson29060642009-01-31 22:14:21 +00008841 &resunicode, &i_newpos)) {
8842 Py_DECREF(restuple);
8843 return NULL;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008844 }
Martin v. Löwis18e16552006-02-15 17:27:45 +00008845 if (i_newpos<0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008846 *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
Martin v. Löwis18e16552006-02-15 17:27:45 +00008847 else
8848 *newpos = i_newpos;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008849 if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
Victor Stinnera33bce02014-07-04 22:47:46 +02008850 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
Benjamin Peterson29060642009-01-31 22:14:21 +00008851 Py_DECREF(restuple);
8852 return NULL;
Walter Dörwald2e0b18a2003-01-31 17:19:08 +00008853 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008854 Py_INCREF(resunicode);
8855 Py_DECREF(restuple);
8856 return resunicode;
8857}
8858
8859/* Lookup the character ch in the mapping and put the result in result,
8860 which must be decrefed by the caller.
8861 Return 0 on success, -1 on error */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008862static int
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008863charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008864{
Christian Heimes217cfd12007-12-02 14:31:20 +00008865 PyObject *w = PyLong_FromLong((long)c);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008866 PyObject *x;
8867
8868 if (w == NULL)
Benjamin Peterson29060642009-01-31 22:14:21 +00008869 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008870 x = PyObject_GetItem(mapping, w);
8871 Py_DECREF(w);
8872 if (x == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008873 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8874 /* No mapping found means: use 1:1 mapping. */
8875 PyErr_Clear();
8876 *result = NULL;
8877 return 0;
8878 } else
8879 return -1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008880 }
8881 else if (x == Py_None) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008882 *result = x;
8883 return 0;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008884 }
Christian Heimes217cfd12007-12-02 14:31:20 +00008885 else if (PyLong_Check(x)) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008886 long value = PyLong_AS_LONG(x);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008887 if (value < 0 || value > MAX_UNICODE) {
8888 PyErr_Format(PyExc_ValueError,
8889 "character mapping must be in range(0x%x)",
8890 MAX_UNICODE+1);
Benjamin Peterson29060642009-01-31 22:14:21 +00008891 Py_DECREF(x);
8892 return -1;
8893 }
8894 *result = x;
8895 return 0;
8896 }
8897 else if (PyUnicode_Check(x)) {
8898 *result = x;
8899 return 0;
8900 }
8901 else {
8902 /* wrong return value */
8903 PyErr_SetString(PyExc_TypeError,
8904 "character mapping must return integer, None or str");
Benjamin Peterson14339b62009-01-31 16:36:08 +00008905 Py_DECREF(x);
8906 return -1;
8907 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008908}
Victor Stinner1194ea02014-04-04 19:37:40 +02008909
8910/* lookup the character, write the result into the writer.
8911 Return 1 if the result was written into the writer, return 0 if the mapping
8912 was undefined, raise an exception return -1 on error. */
Alexander Belopolsky40018472011-02-26 01:02:56 +00008913static int
Victor Stinner1194ea02014-04-04 19:37:40 +02008914charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8915 _PyUnicodeWriter *writer)
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008916{
Victor Stinner1194ea02014-04-04 19:37:40 +02008917 PyObject *item;
8918
8919 if (charmaptranslate_lookup(ch, mapping, &item))
Benjamin Peterson29060642009-01-31 22:14:21 +00008920 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008921
8922 if (item == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00008923 /* not found => default to 1:1 mapping */
Victor Stinner1194ea02014-04-04 19:37:40 +02008924 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02008925 return -1;
Benjamin Peterson29060642009-01-31 22:14:21 +00008926 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008927 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008928 }
Victor Stinner1194ea02014-04-04 19:37:40 +02008929
8930 if (item == Py_None) {
8931 Py_DECREF(item);
8932 return 0;
8933 }
8934
8935 if (PyLong_Check(item)) {
Victor Stinner4ff33af2014-04-05 11:56:37 +02008936 long ch = (Py_UCS4)PyLong_AS_LONG(item);
8937 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8938 used it */
Victor Stinner1194ea02014-04-04 19:37:40 +02008939 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8940 Py_DECREF(item);
8941 return -1;
8942 }
8943 Py_DECREF(item);
8944 return 1;
8945 }
8946
8947 if (!PyUnicode_Check(item)) {
8948 Py_DECREF(item);
Benjamin Peterson29060642009-01-31 22:14:21 +00008949 return -1;
Victor Stinner1194ea02014-04-04 19:37:40 +02008950 }
8951
8952 if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8953 Py_DECREF(item);
8954 return -1;
8955 }
8956
8957 Py_DECREF(item);
8958 return 1;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00008959}
8960
Victor Stinner89a76ab2014-04-05 11:44:04 +02008961static int
8962unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8963 Py_UCS1 *translate)
8964{
Benjamin Peterson1365de72014-04-07 20:15:41 -04008965 PyObject *item = NULL;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008966 int ret = 0;
8967
Victor Stinner89a76ab2014-04-05 11:44:04 +02008968 if (charmaptranslate_lookup(ch, mapping, &item)) {
8969 return -1;
8970 }
8971
8972 if (item == Py_None) {
Benjamin Peterson1365de72014-04-07 20:15:41 -04008973 /* deletion */
Victor Stinner872b2912014-04-05 14:27:07 +02008974 translate[ch] = 0xfe;
Victor Stinner89a76ab2014-04-05 11:44:04 +02008975 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008976 else if (item == NULL) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008977 /* not found => default to 1:1 mapping */
8978 translate[ch] = ch;
8979 return 1;
8980 }
Benjamin Peterson1365de72014-04-07 20:15:41 -04008981 else if (PyLong_Check(item)) {
Victor Stinner4dd25252014-04-08 09:14:21 +02008982 long replace = PyLong_AS_LONG(item);
Victor Stinner4ff33af2014-04-05 11:56:37 +02008983 /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8984 used it */
8985 if (127 < replace) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02008986 /* invalid character or character outside ASCII:
8987 skip the fast translate */
8988 goto exit;
8989 }
8990 translate[ch] = (Py_UCS1)replace;
8991 }
8992 else if (PyUnicode_Check(item)) {
8993 Py_UCS4 replace;
8994
8995 if (PyUnicode_READY(item) == -1) {
8996 Py_DECREF(item);
8997 return -1;
8998 }
8999 if (PyUnicode_GET_LENGTH(item) != 1)
9000 goto exit;
9001
9002 replace = PyUnicode_READ_CHAR(item, 0);
9003 if (replace > 127)
9004 goto exit;
9005 translate[ch] = (Py_UCS1)replace;
9006 }
9007 else {
Benjamin Peterson1365de72014-04-07 20:15:41 -04009008 /* not None, NULL, long or unicode */
Victor Stinner89a76ab2014-04-05 11:44:04 +02009009 goto exit;
9010 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009011 ret = 1;
9012
Benjamin Peterson1365de72014-04-07 20:15:41 -04009013 exit:
9014 Py_DECREF(item);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009015 return ret;
9016}
9017
9018/* Fast path for ascii => ascii translation. Return 1 if the whole string
9019 was translated into writer, return 0 if the input string was partially
9020 translated into writer, raise an exception and return -1 on error. */
9021static int
9022unicode_fast_translate(PyObject *input, PyObject *mapping,
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009023 _PyUnicodeWriter *writer, int ignore,
9024 Py_ssize_t *input_pos)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009025{
Victor Stinner872b2912014-04-05 14:27:07 +02009026 Py_UCS1 ascii_table[128], ch, ch2;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009027 Py_ssize_t len;
9028 Py_UCS1 *in, *end, *out;
Victor Stinner872b2912014-04-05 14:27:07 +02009029 int res = 0;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009030
Victor Stinner89a76ab2014-04-05 11:44:04 +02009031 len = PyUnicode_GET_LENGTH(input);
9032
Victor Stinner872b2912014-04-05 14:27:07 +02009033 memset(ascii_table, 0xff, 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009034
9035 in = PyUnicode_1BYTE_DATA(input);
9036 end = in + len;
9037
9038 assert(PyUnicode_IS_ASCII(writer->buffer));
9039 assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
9040 out = PyUnicode_1BYTE_DATA(writer->buffer);
9041
Victor Stinner872b2912014-04-05 14:27:07 +02009042 for (; in < end; in++) {
Victor Stinner89a76ab2014-04-05 11:44:04 +02009043 ch = *in;
Victor Stinner872b2912014-04-05 14:27:07 +02009044 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009045 if (ch2 == 0xff) {
Victor Stinner872b2912014-04-05 14:27:07 +02009046 int translate = unicode_fast_translate_lookup(mapping, ch,
9047 ascii_table);
9048 if (translate < 0)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009049 return -1;
Victor Stinner872b2912014-04-05 14:27:07 +02009050 if (translate == 0)
9051 goto exit;
9052 ch2 = ascii_table[ch];
Victor Stinner89a76ab2014-04-05 11:44:04 +02009053 }
Victor Stinner872b2912014-04-05 14:27:07 +02009054 if (ch2 == 0xfe) {
9055 if (ignore)
9056 continue;
9057 goto exit;
9058 }
9059 assert(ch2 < 128);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009060 *out = ch2;
Victor Stinner872b2912014-04-05 14:27:07 +02009061 out++;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009062 }
Victor Stinner872b2912014-04-05 14:27:07 +02009063 res = 1;
9064
9065exit:
9066 writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
Victor Stinner6c9aa8f2016-03-01 21:30:30 +01009067 *input_pos = in - PyUnicode_1BYTE_DATA(input);
Victor Stinner872b2912014-04-05 14:27:07 +02009068 return res;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009069}
9070
Victor Stinner3222da22015-10-01 22:07:32 +02009071static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009072_PyUnicode_TranslateCharmap(PyObject *input,
9073 PyObject *mapping,
9074 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009075{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009076 /* input object */
Victor Stinner1194ea02014-04-04 19:37:40 +02009077 char *data;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009078 Py_ssize_t size, i;
9079 int kind;
9080 /* output buffer */
Victor Stinner1194ea02014-04-04 19:37:40 +02009081 _PyUnicodeWriter writer;
9082 /* error handler */
Serhiy Storchakae2f92de2017-11-11 13:06:26 +02009083 const char *reason = "character maps to <undefined>";
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009084 PyObject *errorHandler = NULL;
9085 PyObject *exc = NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009086 int ignore;
Victor Stinner89a76ab2014-04-05 11:44:04 +02009087 int res;
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009088
Guido van Rossumd57fd912000-03-10 22:53:23 +00009089 if (mapping == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009090 PyErr_BadArgument();
9091 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009092 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009093
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009094 if (PyUnicode_READY(input) == -1)
9095 return NULL;
Victor Stinner1194ea02014-04-04 19:37:40 +02009096 data = (char*)PyUnicode_DATA(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009097 kind = PyUnicode_KIND(input);
9098 size = PyUnicode_GET_LENGTH(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009099
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009100 if (size == 0)
9101 return PyUnicode_FromObject(input);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009102
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009103 /* allocate enough for a simple 1:1 translation without
9104 replacements, if we need more, we'll resize */
Victor Stinner1194ea02014-04-04 19:37:40 +02009105 _PyUnicodeWriter_Init(&writer);
9106 if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +00009107 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009108
Victor Stinner872b2912014-04-05 14:27:07 +02009109 ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
9110
Victor Stinner33798672016-03-01 21:59:58 +01009111 if (PyUnicode_READY(input) == -1)
Victor Stinner89a76ab2014-04-05 11:44:04 +02009112 return NULL;
Victor Stinner33798672016-03-01 21:59:58 +01009113 if (PyUnicode_IS_ASCII(input)) {
9114 res = unicode_fast_translate(input, mapping, &writer, ignore, &i);
9115 if (res < 0) {
9116 _PyUnicodeWriter_Dealloc(&writer);
9117 return NULL;
9118 }
9119 if (res == 1)
9120 return _PyUnicodeWriter_Finish(&writer);
Victor Stinner89a76ab2014-04-05 11:44:04 +02009121 }
Victor Stinner33798672016-03-01 21:59:58 +01009122 else {
9123 i = 0;
9124 }
Victor Stinner89a76ab2014-04-05 11:44:04 +02009125
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009126 while (i<size) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009127 /* try to encode it */
Victor Stinner1194ea02014-04-04 19:37:40 +02009128 int translate;
9129 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
9130 Py_ssize_t newpos;
9131 /* startpos for collecting untranslatable chars */
9132 Py_ssize_t collstart;
9133 Py_ssize_t collend;
Victor Stinner1194ea02014-04-04 19:37:40 +02009134 Py_UCS4 ch;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009135
Victor Stinner1194ea02014-04-04 19:37:40 +02009136 ch = PyUnicode_READ(kind, data, i);
9137 translate = charmaptranslate_output(ch, mapping, &writer);
9138 if (translate < 0)
9139 goto onError;
9140
9141 if (translate != 0) {
9142 /* it worked => adjust input pointer */
9143 ++i;
9144 continue;
9145 }
9146
9147 /* untranslatable character */
9148 collstart = i;
9149 collend = i+1;
9150
9151 /* find all untranslatable characters */
9152 while (collend < size) {
9153 PyObject *x;
9154 ch = PyUnicode_READ(kind, data, collend);
9155 if (charmaptranslate_lookup(ch, mapping, &x))
Benjamin Peterson14339b62009-01-31 16:36:08 +00009156 goto onError;
Victor Stinner1194ea02014-04-04 19:37:40 +02009157 Py_XDECREF(x);
9158 if (x != Py_None)
Benjamin Peterson29060642009-01-31 22:14:21 +00009159 break;
Victor Stinner1194ea02014-04-04 19:37:40 +02009160 ++collend;
9161 }
9162
9163 if (ignore) {
9164 i = collend;
9165 }
9166 else {
9167 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
9168 reason, input, &exc,
9169 collstart, collend, &newpos);
9170 if (repunicode == NULL)
9171 goto onError;
9172 if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009173 Py_DECREF(repunicode);
Victor Stinner1194ea02014-04-04 19:37:40 +02009174 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009175 }
Victor Stinner1194ea02014-04-04 19:37:40 +02009176 Py_DECREF(repunicode);
9177 i = newpos;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009178 }
9179 }
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009180 Py_XDECREF(exc);
9181 Py_XDECREF(errorHandler);
Victor Stinner1194ea02014-04-04 19:37:40 +02009182 return _PyUnicodeWriter_Finish(&writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009183
Benjamin Peterson29060642009-01-31 22:14:21 +00009184 onError:
Victor Stinner1194ea02014-04-04 19:37:40 +02009185 _PyUnicodeWriter_Dealloc(&writer);
Walter Dörwald3aeb6322002-09-02 13:14:32 +00009186 Py_XDECREF(exc);
9187 Py_XDECREF(errorHandler);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009188 return NULL;
9189}
9190
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009191/* Deprecated. Use PyUnicode_Translate instead. */
9192PyObject *
9193PyUnicode_TranslateCharmap(const Py_UNICODE *p,
9194 Py_ssize_t size,
9195 PyObject *mapping,
9196 const char *errors)
9197{
Christian Heimes5f520f42012-09-11 14:03:25 +02009198 PyObject *result;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009199 PyObject *unicode = PyUnicode_FromWideChar(p, size);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009200 if (!unicode)
9201 return NULL;
Christian Heimes5f520f42012-09-11 14:03:25 +02009202 result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
9203 Py_DECREF(unicode);
9204 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009205}
9206
Alexander Belopolsky40018472011-02-26 01:02:56 +00009207PyObject *
9208PyUnicode_Translate(PyObject *str,
9209 PyObject *mapping,
9210 const char *errors)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009211{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009212 if (ensure_unicode(str) < 0)
Christian Heimes5f520f42012-09-11 14:03:25 +02009213 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009214 return _PyUnicode_TranslateCharmap(str, mapping, errors);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009215}
Tim Petersced69f82003-09-16 20:30:58 +00009216
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009217PyObject *
9218_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
9219{
9220 if (!PyUnicode_Check(unicode)) {
9221 PyErr_BadInternalCall();
9222 return NULL;
9223 }
9224 if (PyUnicode_READY(unicode) == -1)
9225 return NULL;
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009226 if (PyUnicode_IS_ASCII(unicode)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009227 /* If the string is already ASCII, just return the same string */
9228 Py_INCREF(unicode);
9229 return unicode;
9230 }
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009231
9232 Py_ssize_t len = PyUnicode_GET_LENGTH(unicode);
9233 PyObject *result = PyUnicode_New(len, 127);
9234 if (result == NULL) {
9235 return NULL;
9236 }
9237
9238 Py_UCS1 *out = PyUnicode_1BYTE_DATA(result);
9239 int kind = PyUnicode_KIND(unicode);
9240 const void *data = PyUnicode_DATA(unicode);
9241 Py_ssize_t i;
9242 for (i = 0; i < len; ++i) {
9243 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
9244 if (ch < 127) {
9245 out[i] = ch;
9246 }
9247 else if (Py_UNICODE_ISSPACE(ch)) {
9248 out[i] = ' ';
9249 }
9250 else {
9251 int decimal = Py_UNICODE_TODECIMAL(ch);
9252 if (decimal < 0) {
9253 out[i] = '?';
INADA Naoki16dfca42018-07-14 12:06:43 +09009254 out[i+1] = '\0';
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009255 _PyUnicode_LENGTH(result) = i + 1;
9256 break;
9257 }
9258 out[i] = '0' + decimal;
9259 }
9260 }
9261
INADA Naoki16dfca42018-07-14 12:06:43 +09009262 assert(_PyUnicode_CheckConsistency(result, 1));
Serhiy Storchaka9b6c60c2017-11-13 21:23:48 +02009263 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009264}
9265
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009266PyObject *
9267PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
9268 Py_ssize_t length)
9269{
Victor Stinnerf0124502011-11-21 23:12:56 +01009270 PyObject *decimal;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009271 Py_ssize_t i;
Victor Stinnerf0124502011-11-21 23:12:56 +01009272 Py_UCS4 maxchar;
9273 enum PyUnicode_Kind kind;
9274 void *data;
9275
Victor Stinner99d7ad02012-02-22 13:37:39 +01009276 maxchar = 127;
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009277 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009278 Py_UCS4 ch = s[i];
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009279 if (ch > 127) {
9280 int decimal = Py_UNICODE_TODECIMAL(ch);
9281 if (decimal >= 0)
Victor Stinnerf0124502011-11-21 23:12:56 +01009282 ch = '0' + decimal;
Benjamin Peterson7e303732013-06-10 09:19:46 -07009283 maxchar = Py_MAX(maxchar, ch);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009284 }
9285 }
Victor Stinnerf0124502011-11-21 23:12:56 +01009286
9287 /* Copy to a new string */
9288 decimal = PyUnicode_New(length, maxchar);
9289 if (decimal == NULL)
9290 return decimal;
9291 kind = PyUnicode_KIND(decimal);
9292 data = PyUnicode_DATA(decimal);
9293 /* Iterate over code points */
9294 for (i = 0; i < length; i++) {
Victor Stinner12174a52014-08-15 23:17:38 +02009295 Py_UCS4 ch = s[i];
Victor Stinnerf0124502011-11-21 23:12:56 +01009296 if (ch > 127) {
9297 int decimal = Py_UNICODE_TODECIMAL(ch);
9298 if (decimal >= 0)
9299 ch = '0' + decimal;
9300 }
9301 PyUnicode_WRITE(kind, data, i, ch);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009302 }
Victor Stinnerd3df8ab2011-11-22 01:22:34 +01009303 return unicode_result(decimal);
Alexander Belopolsky942af5a2010-12-04 03:38:46 +00009304}
Guido van Rossum9e896b32000-04-05 20:11:21 +00009305/* --- Decimal Encoder ---------------------------------------------------- */
9306
Alexander Belopolsky40018472011-02-26 01:02:56 +00009307int
9308PyUnicode_EncodeDecimal(Py_UNICODE *s,
9309 Py_ssize_t length,
9310 char *output,
9311 const char *errors)
Guido van Rossum9e896b32000-04-05 20:11:21 +00009312{
Martin v. Löwis23e275b2011-11-02 18:02:51 +01009313 PyObject *unicode;
Victor Stinner6345be92011-11-25 20:09:01 +01009314 Py_ssize_t i;
Victor Stinner42bf7752011-11-21 22:52:58 +01009315 enum PyUnicode_Kind kind;
9316 void *data;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009317
9318 if (output == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +00009319 PyErr_BadArgument();
9320 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009321 }
9322
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +02009323 unicode = PyUnicode_FromWideChar(s, length);
Victor Stinner42bf7752011-11-21 22:52:58 +01009324 if (unicode == NULL)
9325 return -1;
9326
Victor Stinner42bf7752011-11-21 22:52:58 +01009327 kind = PyUnicode_KIND(unicode);
9328 data = PyUnicode_DATA(unicode);
9329
Victor Stinnerb84d7232011-11-22 01:50:07 +01009330 for (i=0; i < length; ) {
Victor Stinner6345be92011-11-25 20:09:01 +01009331 PyObject *exc;
9332 Py_UCS4 ch;
Benjamin Peterson29060642009-01-31 22:14:21 +00009333 int decimal;
Victor Stinner6345be92011-11-25 20:09:01 +01009334 Py_ssize_t startpos;
9335
9336 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +00009337
Benjamin Peterson29060642009-01-31 22:14:21 +00009338 if (Py_UNICODE_ISSPACE(ch)) {
Benjamin Peterson14339b62009-01-31 16:36:08 +00009339 *output++ = ' ';
Victor Stinnerb84d7232011-11-22 01:50:07 +01009340 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009341 continue;
Benjamin Peterson14339b62009-01-31 16:36:08 +00009342 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009343 decimal = Py_UNICODE_TODECIMAL(ch);
9344 if (decimal >= 0) {
9345 *output++ = '0' + decimal;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009346 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009347 continue;
9348 }
9349 if (0 < ch && ch < 256) {
9350 *output++ = (char)ch;
Victor Stinnerb84d7232011-11-22 01:50:07 +01009351 i++;
Benjamin Peterson29060642009-01-31 22:14:21 +00009352 continue;
9353 }
Victor Stinner6345be92011-11-25 20:09:01 +01009354
Victor Stinner42bf7752011-11-21 22:52:58 +01009355 startpos = i;
Victor Stinner6345be92011-11-25 20:09:01 +01009356 exc = NULL;
9357 raise_encode_exception(&exc, "decimal", unicode,
9358 startpos, startpos+1,
9359 "invalid decimal Unicode string");
9360 Py_XDECREF(exc);
9361 Py_DECREF(unicode);
9362 return -1;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009363 }
9364 /* 0-terminate the output string */
9365 *output++ = '\0';
Victor Stinner42bf7752011-11-21 22:52:58 +01009366 Py_DECREF(unicode);
Guido van Rossum9e896b32000-04-05 20:11:21 +00009367 return 0;
Guido van Rossum9e896b32000-04-05 20:11:21 +00009368}
9369
Guido van Rossumd57fd912000-03-10 22:53:23 +00009370/* --- Helpers ------------------------------------------------------------ */
9371
/* Helper macro to fix up start/end slice values against a sequence of
   length `len`:
     - `end` greater than `len` is clamped to `len`; a negative `end` is
       taken relative to `len` and clamped to 0 if still negative;
     - a negative `start` is taken relative to `len` and clamped to 0 if
       still negative (a `start` beyond `len` is left as is).

   `start` and `end` must be modifiable lvalues; they are updated in place.
   `len` may be evaluated more than once, so it must have no side effects.

   The body is wrapped in do { ... } while (0) so that the macro behaves
   as a single statement (safe in an unbraced if/else); invoke it with a
   trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9386
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009387static Py_ssize_t
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009388any_find_slice(PyObject* s1, PyObject* s2,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009389 Py_ssize_t start,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009390 Py_ssize_t end,
9391 int direction)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009392{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009393 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009394 void *buf1, *buf2;
9395 Py_ssize_t len1, len2, result;
9396
9397 kind1 = PyUnicode_KIND(s1);
9398 kind2 = PyUnicode_KIND(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009399 if (kind1 < kind2)
9400 return -1;
9401
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009402 len1 = PyUnicode_GET_LENGTH(s1);
9403 len2 = PyUnicode_GET_LENGTH(s2);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009404 ADJUST_INDICES(start, end, len1);
9405 if (end - start < len2)
9406 return -1;
9407
9408 buf1 = PyUnicode_DATA(s1);
9409 buf2 = PyUnicode_DATA(s2);
9410 if (len2 == 1) {
9411 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9412 result = findchar((const char *)buf1 + kind1*start,
9413 kind1, end - start, ch, direction);
9414 if (result == -1)
9415 return -1;
9416 else
9417 return start + result;
9418 }
9419
9420 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009421 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009422 if (!buf2)
9423 return -2;
9424 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009425
Victor Stinner794d5672011-10-10 03:21:36 +02009426 if (direction > 0) {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009427 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009428 case PyUnicode_1BYTE_KIND:
9429 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9430 result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9431 else
9432 result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9433 break;
9434 case PyUnicode_2BYTE_KIND:
9435 result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9436 break;
9437 case PyUnicode_4BYTE_KIND:
9438 result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9439 break;
9440 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009441 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009442 }
9443 }
9444 else {
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009445 switch (kind1) {
Victor Stinner794d5672011-10-10 03:21:36 +02009446 case PyUnicode_1BYTE_KIND:
9447 if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9448 result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9449 else
9450 result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9451 break;
9452 case PyUnicode_2BYTE_KIND:
9453 result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9454 break;
9455 case PyUnicode_4BYTE_KIND:
9456 result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9457 break;
9458 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009459 Py_UNREACHABLE();
Victor Stinner794d5672011-10-10 03:21:36 +02009460 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009461 }
9462
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009463 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009464 PyMem_Free(buf2);
9465
9466 return result;
9467}
9468
Victor Stinner59423e32018-11-26 13:40:01 +01009469/* _PyUnicode_InsertThousandsGrouping() helper functions */
9470#include "stringlib/localeutil.h"
9471
9472/**
9473 * InsertThousandsGrouping:
9474 * @writer: Unicode writer.
9475 * @n_buffer: Number of characters in @buffer.
9476 * @digits: Digits we're reading from. If count is non-NULL, this is unused.
9477 * @d_pos: Start of digits string.
9478 * @n_digits: The number of digits in the string, in which we want
9479 * to put the grouping chars.
9480 * @min_width: The minimum width of the digits in the output string.
9481 * Output will be zero-padded on the left to fill.
9482 * @grouping: see definition in localeconv().
9483 * @thousands_sep: see definition in localeconv().
9484 *
9485 * There are 2 modes: counting and filling. If @writer is NULL,
9486 * we are in counting mode, else filling mode.
9487 * If counting, the required buffer size is returned.
9488 * If filling, we know the buffer will be large enough, so we don't
9489 * need to pass in the buffer size.
9490 * Inserts thousand grouping characters (as defined by grouping and
9491 * thousands_sep) into @writer.
9492 *
9493 * Return value: -1 on error, number of characters otherwise.
9494 **/
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009495Py_ssize_t
Victor Stinner41a863c2012-02-24 00:37:51 +01009496_PyUnicode_InsertThousandsGrouping(
Victor Stinner59423e32018-11-26 13:40:01 +01009497 _PyUnicodeWriter *writer,
Victor Stinner41a863c2012-02-24 00:37:51 +01009498 Py_ssize_t n_buffer,
Victor Stinner59423e32018-11-26 13:40:01 +01009499 PyObject *digits,
9500 Py_ssize_t d_pos,
9501 Py_ssize_t n_digits,
Victor Stinner41a863c2012-02-24 00:37:51 +01009502 Py_ssize_t min_width,
Victor Stinner59423e32018-11-26 13:40:01 +01009503 const char *grouping,
9504 PyObject *thousands_sep,
Victor Stinner41a863c2012-02-24 00:37:51 +01009505 Py_UCS4 *maxchar)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009506{
Xtreak3f7983a2019-01-07 20:39:14 +05309507 min_width = Py_MAX(0, min_width);
Victor Stinner59423e32018-11-26 13:40:01 +01009508 if (writer) {
9509 assert(digits != NULL);
9510 assert(maxchar == NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009511 }
9512 else {
Victor Stinner59423e32018-11-26 13:40:01 +01009513 assert(digits == NULL);
9514 assert(maxchar != NULL);
Victor Stinner41a863c2012-02-24 00:37:51 +01009515 }
Victor Stinner59423e32018-11-26 13:40:01 +01009516 assert(0 <= d_pos);
9517 assert(0 <= n_digits);
Victor Stinner59423e32018-11-26 13:40:01 +01009518 assert(grouping != NULL);
9519
9520 if (digits != NULL) {
9521 if (PyUnicode_READY(digits) == -1) {
9522 return -1;
Victor Stinner90f50d42012-02-24 01:44:47 +01009523 }
Victor Stinner59423e32018-11-26 13:40:01 +01009524 }
9525 if (PyUnicode_READY(thousands_sep) == -1) {
9526 return -1;
Victor Stinner41a863c2012-02-24 00:37:51 +01009527 }
9528
Victor Stinner59423e32018-11-26 13:40:01 +01009529 Py_ssize_t count = 0;
9530 Py_ssize_t n_zeros;
9531 int loop_broken = 0;
9532 int use_separator = 0; /* First time through, don't append the
9533 separator. They only go between
9534 groups. */
9535 Py_ssize_t buffer_pos;
9536 Py_ssize_t digits_pos;
9537 Py_ssize_t len;
9538 Py_ssize_t n_chars;
9539 Py_ssize_t remaining = n_digits; /* Number of chars remaining to
9540 be looked at */
9541 /* A generator that returns all of the grouping widths, until it
9542 returns 0. */
9543 GroupGenerator groupgen;
9544 GroupGenerator_init(&groupgen, grouping);
9545 const Py_ssize_t thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9546
9547 /* if digits are not grouped, thousands separator
9548 should be an empty string */
9549 assert(!(grouping[0] == CHAR_MAX && thousands_sep_len != 0));
9550
9551 digits_pos = d_pos + n_digits;
9552 if (writer) {
9553 buffer_pos = writer->pos + n_buffer;
9554 assert(buffer_pos <= PyUnicode_GET_LENGTH(writer->buffer));
9555 assert(digits_pos <= PyUnicode_GET_LENGTH(digits));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009556 }
Victor Stinner59423e32018-11-26 13:40:01 +01009557 else {
9558 buffer_pos = n_buffer;
Victor Stinner90f50d42012-02-24 01:44:47 +01009559 }
Victor Stinner59423e32018-11-26 13:40:01 +01009560
9561 if (!writer) {
Victor Stinner41a863c2012-02-24 00:37:51 +01009562 *maxchar = 127;
Victor Stinner41a863c2012-02-24 00:37:51 +01009563 }
Victor Stinner59423e32018-11-26 13:40:01 +01009564
9565 while ((len = GroupGenerator_next(&groupgen)) > 0) {
9566 len = Py_MIN(len, Py_MAX(Py_MAX(remaining, min_width), 1));
9567 n_zeros = Py_MAX(0, len - remaining);
9568 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9569
9570 /* Use n_zero zero's and n_chars chars */
9571
9572 /* Count only, don't do anything. */
9573 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9574
9575 /* Copy into the writer. */
9576 InsertThousandsGrouping_fill(writer, &buffer_pos,
9577 digits, &digits_pos,
9578 n_chars, n_zeros,
9579 use_separator ? thousands_sep : NULL,
9580 thousands_sep_len, maxchar);
9581
9582 /* Use a separator next time. */
9583 use_separator = 1;
9584
9585 remaining -= n_chars;
9586 min_width -= len;
9587
9588 if (remaining <= 0 && min_width <= 0) {
9589 loop_broken = 1;
9590 break;
9591 }
9592 min_width -= thousands_sep_len;
9593 }
9594 if (!loop_broken) {
9595 /* We left the loop without using a break statement. */
9596
9597 len = Py_MAX(Py_MAX(remaining, min_width), 1);
9598 n_zeros = Py_MAX(0, len - remaining);
9599 n_chars = Py_MAX(0, Py_MIN(remaining, len));
9600
9601 /* Use n_zero zero's and n_chars chars */
9602 count += (use_separator ? thousands_sep_len : 0) + n_zeros + n_chars;
9603
9604 /* Copy into the writer. */
9605 InsertThousandsGrouping_fill(writer, &buffer_pos,
9606 digits, &digits_pos,
9607 n_chars, n_zeros,
9608 use_separator ? thousands_sep : NULL,
9609 thousands_sep_len, maxchar);
9610 }
9611 return count;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009612}
9613
9614
Alexander Belopolsky40018472011-02-26 01:02:56 +00009615Py_ssize_t
9616PyUnicode_Count(PyObject *str,
9617 PyObject *substr,
9618 Py_ssize_t start,
9619 Py_ssize_t end)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009620{
Martin v. Löwis18e16552006-02-15 17:27:45 +00009621 Py_ssize_t result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009622 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009623 void *buf1 = NULL, *buf2 = NULL;
9624 Py_ssize_t len1, len2;
Tim Petersced69f82003-09-16 20:30:58 +00009625
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009626 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009627 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009628
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009629 kind1 = PyUnicode_KIND(str);
9630 kind2 = PyUnicode_KIND(substr);
9631 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009632 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009633
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009634 len1 = PyUnicode_GET_LENGTH(str);
9635 len2 = PyUnicode_GET_LENGTH(substr);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009636 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009637 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009638 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009639
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009640 buf1 = PyUnicode_DATA(str);
9641 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009642 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +03009643 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009644 if (!buf2)
9645 goto onError;
9646 }
9647
9648 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009649 case PyUnicode_1BYTE_KIND:
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009650 if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
Victor Stinnerc3cec782011-10-05 21:24:08 +02009651 result = asciilib_count(
9652 ((Py_UCS1*)buf1) + start, end - start,
9653 buf2, len2, PY_SSIZE_T_MAX
9654 );
9655 else
9656 result = ucs1lib_count(
9657 ((Py_UCS1*)buf1) + start, end - start,
9658 buf2, len2, PY_SSIZE_T_MAX
9659 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009660 break;
9661 case PyUnicode_2BYTE_KIND:
9662 result = ucs2lib_count(
9663 ((Py_UCS2*)buf1) + start, end - start,
9664 buf2, len2, PY_SSIZE_T_MAX
9665 );
9666 break;
9667 case PyUnicode_4BYTE_KIND:
9668 result = ucs4lib_count(
9669 ((Py_UCS4*)buf1) + start, end - start,
9670 buf2, len2, PY_SSIZE_T_MAX
9671 );
9672 break;
9673 default:
Barry Warsawb2e57942017-09-14 18:13:16 -07009674 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009675 }
Thomas Wouters477c8d52006-05-27 19:21:47 +00009676
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009677 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009678 PyMem_Free(buf2);
9679
Guido van Rossumd57fd912000-03-10 22:53:23 +00009680 return result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009681 onError:
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009682 if (kind2 != kind1 && buf2)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009683 PyMem_Free(buf2);
9684 return -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009685}
9686
Alexander Belopolsky40018472011-02-26 01:02:56 +00009687Py_ssize_t
9688PyUnicode_Find(PyObject *str,
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009689 PyObject *substr,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009690 Py_ssize_t start,
9691 Py_ssize_t end,
9692 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009693{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009694 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009695 return -2;
Tim Petersced69f82003-09-16 20:30:58 +00009696
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009697 return any_find_slice(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009698}
9699
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009700Py_ssize_t
9701PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9702 Py_ssize_t start, Py_ssize_t end,
9703 int direction)
9704{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009705 int kind;
Xiang Zhangb2110682016-12-20 22:52:33 +08009706 Py_ssize_t len, result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009707 if (PyUnicode_READY(str) == -1)
9708 return -2;
Xiang Zhangb2110682016-12-20 22:52:33 +08009709 len = PyUnicode_GET_LENGTH(str);
9710 ADJUST_INDICES(start, end, len);
9711 if (end - start < 1)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +02009712 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009713 kind = PyUnicode_KIND(str);
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009714 result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9715 kind, end-start, ch, direction);
9716 if (result == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009717 return -1;
Antoine Pitrouf0b934b2011-10-13 18:55:09 +02009718 else
9719 return start + result;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009720}
9721
Alexander Belopolsky40018472011-02-26 01:02:56 +00009722static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +02009723tailmatch(PyObject *self,
9724 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +00009725 Py_ssize_t start,
9726 Py_ssize_t end,
9727 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009728{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009729 int kind_self;
9730 int kind_sub;
9731 void *data_self;
9732 void *data_sub;
9733 Py_ssize_t offset;
9734 Py_ssize_t i;
9735 Py_ssize_t end_sub;
9736
9737 if (PyUnicode_READY(self) == -1 ||
9738 PyUnicode_READY(substring) == -1)
Victor Stinner18aa4472013-01-03 03:18:09 +01009739 return -1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009740
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009741 ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9742 end -= PyUnicode_GET_LENGTH(substring);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009743 if (end < start)
Benjamin Peterson29060642009-01-31 22:14:21 +00009744 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009745
Serhiy Storchakad4ea03c2015-05-31 09:15:51 +03009746 if (PyUnicode_GET_LENGTH(substring) == 0)
9747 return 1;
9748
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009749 kind_self = PyUnicode_KIND(self);
9750 data_self = PyUnicode_DATA(self);
9751 kind_sub = PyUnicode_KIND(substring);
9752 data_sub = PyUnicode_DATA(substring);
9753 end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9754
9755 if (direction > 0)
9756 offset = end;
9757 else
9758 offset = start;
9759
9760 if (PyUnicode_READ(kind_self, data_self, offset) ==
9761 PyUnicode_READ(kind_sub, data_sub, 0) &&
9762 PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9763 PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9764 /* If both are of the same kind, memcmp is sufficient */
9765 if (kind_self == kind_sub) {
9766 return ! memcmp((char *)data_self +
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009767 (offset * PyUnicode_KIND(substring)),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009768 data_sub,
9769 PyUnicode_GET_LENGTH(substring) *
Martin v. Löwisc47adb02011-10-07 20:55:35 +02009770 PyUnicode_KIND(substring));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009771 }
Martin Pantere26da7c2016-06-02 10:07:09 +00009772 /* otherwise we have to compare each character by first accessing it */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009773 else {
9774 /* We do not need to compare 0 and len(substring)-1 because
9775 the if statement above ensured already that they are equal
9776 when we end up here. */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009777 for (i = 1; i < end_sub; ++i) {
9778 if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9779 PyUnicode_READ(kind_sub, data_sub, i))
9780 return 0;
9781 }
Benjamin Peterson29060642009-01-31 22:14:21 +00009782 return 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009783 }
Guido van Rossumd57fd912000-03-10 22:53:23 +00009784 }
9785
9786 return 0;
9787}
9788
Alexander Belopolsky40018472011-02-26 01:02:56 +00009789Py_ssize_t
9790PyUnicode_Tailmatch(PyObject *str,
9791 PyObject *substr,
9792 Py_ssize_t start,
9793 Py_ssize_t end,
9794 int direction)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009795{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009796 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +00009797 return -1;
Tim Petersced69f82003-09-16 20:30:58 +00009798
Serhiy Storchaka21a663e2016-04-13 15:37:23 +03009799 return tailmatch(str, substr, start, end, direction);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009800}
9801
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009802static PyObject *
9803ascii_upper_or_lower(PyObject *self, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009804{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009805 Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9806 char *resdata, *data = PyUnicode_DATA(self);
9807 PyObject *res;
Tim Petersced69f82003-09-16 20:30:58 +00009808
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009809 res = PyUnicode_New(len, 127);
9810 if (res == NULL)
9811 return NULL;
9812 resdata = PyUnicode_DATA(res);
9813 if (lower)
9814 _Py_bytes_lower(resdata, data, len);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009815 else
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009816 _Py_bytes_upper(resdata, data, len);
9817 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009818}
9819
Martin v. Löwisd63a3b82011-09-28 07:41:54 +02009820static Py_UCS4
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009821handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009822{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009823 Py_ssize_t j;
9824 int final_sigma;
Victor Stinner0c39b1b2015-03-18 15:02:06 +01009825 Py_UCS4 c = 0; /* initialize to prevent gcc warning */
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009826 /* U+03A3 is in the Final_Sigma context when, it is found like this:
Tim Petersced69f82003-09-16 20:30:58 +00009827
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009828 \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9829
9830 where ! is a negation and \p{xxx} is a character with property xxx.
9831 */
9832 for (j = i - 1; j >= 0; j--) {
9833 c = PyUnicode_READ(kind, data, j);
9834 if (!_PyUnicode_IsCaseIgnorable(c))
9835 break;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009836 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009837 final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9838 if (final_sigma) {
9839 for (j = i + 1; j < length; j++) {
9840 c = PyUnicode_READ(kind, data, j);
9841 if (!_PyUnicode_IsCaseIgnorable(c))
9842 break;
9843 }
9844 final_sigma = j == length || !_PyUnicode_IsCased(c);
9845 }
9846 return (final_sigma) ? 0x3C2 : 0x3C3;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009847}
9848
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009849static int
9850lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9851 Py_UCS4 c, Py_UCS4 *mapped)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009852{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009853 /* Obscure special case. */
9854 if (c == 0x3A3) {
9855 mapped[0] = handle_capital_sigma(kind, data, length, i);
9856 return 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009857 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009858 return _PyUnicode_ToLowerFull(c, mapped);
Guido van Rossumd57fd912000-03-10 22:53:23 +00009859}
9860
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009861static Py_ssize_t
9862do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009863{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009864 Py_ssize_t i, k = 0;
9865 int n_res, j;
9866 Py_UCS4 c, mapped[3];
Tim Petersced69f82003-09-16 20:30:58 +00009867
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009868 c = PyUnicode_READ(kind, data, 0);
Kingsley Mb015fc82019-04-12 16:35:39 +01009869 n_res = _PyUnicode_ToTitleFull(c, mapped);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009870 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009871 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009872 res[k++] = mapped[j];
Guido van Rossumd57fd912000-03-10 22:53:23 +00009873 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009874 for (i = 1; i < length; i++) {
9875 c = PyUnicode_READ(kind, data, i);
9876 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9877 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009878 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009879 res[k++] = mapped[j];
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009880 }
Marc-André Lemburgfde66e12001-01-29 11:14:16 +00009881 }
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009882 return k;
Guido van Rossumd57fd912000-03-10 22:53:23 +00009883}
9884
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009885static Py_ssize_t
9886do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9887 Py_ssize_t i, k = 0;
9888
9889 for (i = 0; i < length; i++) {
9890 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9891 int n_res, j;
9892 if (Py_UNICODE_ISUPPER(c)) {
9893 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9894 }
9895 else if (Py_UNICODE_ISLOWER(c)) {
9896 n_res = _PyUnicode_ToUpperFull(c, mapped);
9897 }
9898 else {
9899 n_res = 1;
9900 mapped[0] = c;
9901 }
9902 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009903 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009904 res[k++] = mapped[j];
9905 }
9906 }
9907 return k;
9908}
9909
9910static Py_ssize_t
9911do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9912 Py_UCS4 *maxchar, int lower)
Guido van Rossumd57fd912000-03-10 22:53:23 +00009913{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009914 Py_ssize_t i, k = 0;
9915
9916 for (i = 0; i < length; i++) {
9917 Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9918 int n_res, j;
9919 if (lower)
9920 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9921 else
9922 n_res = _PyUnicode_ToUpperFull(c, mapped);
9923 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009924 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009925 res[k++] = mapped[j];
9926 }
9927 }
9928 return k;
9929}
9930
9931static Py_ssize_t
9932do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9933{
9934 return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9935}
9936
9937static Py_ssize_t
9938do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9939{
9940 return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9941}
9942
Benjamin Petersone51757f2012-01-12 21:10:29 -05009943static Py_ssize_t
Benjamin Petersond5890c82012-01-14 13:23:30 -05009944do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9945{
9946 Py_ssize_t i, k = 0;
9947
9948 for (i = 0; i < length; i++) {
9949 Py_UCS4 c = PyUnicode_READ(kind, data, i);
9950 Py_UCS4 mapped[3];
9951 int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9952 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009953 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersond5890c82012-01-14 13:23:30 -05009954 res[k++] = mapped[j];
9955 }
9956 }
9957 return k;
9958}
9959
9960static Py_ssize_t
Benjamin Petersone51757f2012-01-12 21:10:29 -05009961do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9962{
9963 Py_ssize_t i, k = 0;
9964 int previous_is_cased;
9965
9966 previous_is_cased = 0;
9967 for (i = 0; i < length; i++) {
9968 const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9969 Py_UCS4 mapped[3];
9970 int n_res, j;
9971
9972 if (previous_is_cased)
9973 n_res = lower_ucs4(kind, data, length, i, c, mapped);
9974 else
9975 n_res = _PyUnicode_ToTitleFull(c, mapped);
9976
9977 for (j = 0; j < n_res; j++) {
Benjamin Peterson7e303732013-06-10 09:19:46 -07009978 *maxchar = Py_MAX(*maxchar, mapped[j]);
Benjamin Petersone51757f2012-01-12 21:10:29 -05009979 res[k++] = mapped[j];
9980 }
9981
9982 previous_is_cased = _PyUnicode_IsCased(c);
9983 }
9984 return k;
9985}
9986
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009987static PyObject *
9988case_operation(PyObject *self,
9989 Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9990{
9991 PyObject *res = NULL;
9992 Py_ssize_t length, newlength = 0;
9993 int kind, outkind;
9994 void *data, *outdata;
9995 Py_UCS4 maxchar = 0, *tmp, *tmpend;
9996
Benjamin Petersoneea48462012-01-16 14:28:50 -05009997 assert(PyUnicode_IS_READY(self));
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -05009998
9999 kind = PyUnicode_KIND(self);
10000 data = PyUnicode_DATA(self);
10001 length = PyUnicode_GET_LENGTH(self);
Antoine Pitrou4e334242014-10-15 23:14:53 +020010002 if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
Benjamin Petersone1bd38c2014-10-15 11:47:36 -040010003 PyErr_SetString(PyExc_OverflowError, "string is too long");
10004 return NULL;
10005 }
Benjamin Peterson1e211ff2014-10-15 12:17:21 -040010006 tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010007 if (tmp == NULL)
10008 return PyErr_NoMemory();
10009 newlength = perform(kind, data, length, tmp, &maxchar);
10010 res = PyUnicode_New(newlength, maxchar);
10011 if (res == NULL)
10012 goto leave;
10013 tmpend = tmp + newlength;
10014 outdata = PyUnicode_DATA(res);
10015 outkind = PyUnicode_KIND(res);
10016 switch (outkind) {
10017 case PyUnicode_1BYTE_KIND:
10018 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
10019 break;
10020 case PyUnicode_2BYTE_KIND:
10021 _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
10022 break;
10023 case PyUnicode_4BYTE_KIND:
10024 memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
10025 break;
10026 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010027 Py_UNREACHABLE();
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010028 }
10029 leave:
10030 PyMem_FREE(tmp);
10031 return res;
10032}
10033
Tim Peters8ce9f162004-08-27 01:49:32 +000010034PyObject *
10035PyUnicode_Join(PyObject *separator, PyObject *seq)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010036{
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010037 PyObject *res;
10038 PyObject *fseq;
10039 Py_ssize_t seqlen;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010040 PyObject **items;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010041
Benjamin Peterson9743b2c2014-02-15 13:02:52 -050010042 fseq = PySequence_Fast(seq, "can only join an iterable");
Tim Peters05eba1f2004-08-27 21:32:02 +000010043 if (fseq == NULL) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010044 return NULL;
Tim Peters8ce9f162004-08-27 01:49:32 +000010045 }
10046
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010047 /* NOTE: the following code can't call back into Python code,
10048 * so we are sure that fseq won't be mutated.
Tim Peters91879ab2004-08-27 22:35:44 +000010049 */
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010050
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010051 items = PySequence_Fast_ITEMS(fseq);
Tim Peters05eba1f2004-08-27 21:32:02 +000010052 seqlen = PySequence_Fast_GET_SIZE(fseq);
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010053 res = _PyUnicode_JoinArray(separator, items, seqlen);
10054 Py_DECREF(fseq);
10055 return res;
10056}
10057
10058PyObject *
Serhiy Storchakaa5552f02017-12-15 13:11:11 +020010059_PyUnicode_JoinArray(PyObject *separator, PyObject *const *items, Py_ssize_t seqlen)
Serhiy Storchakaea525a22016-09-06 22:07:53 +030010060{
10061 PyObject *res = NULL; /* the result */
10062 PyObject *sep = NULL;
10063 Py_ssize_t seplen;
10064 PyObject *item;
10065 Py_ssize_t sz, i, res_offset;
10066 Py_UCS4 maxchar;
10067 Py_UCS4 item_maxchar;
10068 int use_memcpy;
10069 unsigned char *res_data = NULL, *sep_data = NULL;
10070 PyObject *last_obj;
10071 unsigned int kind = 0;
10072
Tim Peters05eba1f2004-08-27 21:32:02 +000010073 /* If empty sequence, return u"". */
10074 if (seqlen == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010075 _Py_RETURN_UNICODE_EMPTY();
Tim Peters05eba1f2004-08-27 21:32:02 +000010076 }
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020010077
Tim Peters05eba1f2004-08-27 21:32:02 +000010078 /* If singleton sequence with an exact Unicode, return that. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010079 last_obj = NULL;
Victor Stinneracf47b82011-10-06 12:32:37 +020010080 if (seqlen == 1) {
10081 if (PyUnicode_CheckExact(items[0])) {
10082 res = items[0];
10083 Py_INCREF(res);
Victor Stinneracf47b82011-10-06 12:32:37 +020010084 return res;
10085 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010086 seplen = 0;
Victor Stinnerc6f0df72011-10-06 15:58:54 +020010087 maxchar = 0;
Tim Peters8ce9f162004-08-27 01:49:32 +000010088 }
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010089 else {
Victor Stinneracf47b82011-10-06 12:32:37 +020010090 /* Set up sep and seplen */
10091 if (separator == NULL) {
10092 /* fall back to a blank space separator */
10093 sep = PyUnicode_FromOrdinal(' ');
10094 if (!sep)
10095 goto onError;
Victor Stinnerdd077322011-10-07 17:02:31 +020010096 seplen = 1;
Victor Stinneracf47b82011-10-06 12:32:37 +020010097 maxchar = 32;
Tim Peters05eba1f2004-08-27 21:32:02 +000010098 }
Victor Stinneracf47b82011-10-06 12:32:37 +020010099 else {
10100 if (!PyUnicode_Check(separator)) {
10101 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010102 "separator: expected str instance,"
10103 " %.80s found",
10104 Py_TYPE(separator)->tp_name);
Victor Stinneracf47b82011-10-06 12:32:37 +020010105 goto onError;
10106 }
10107 if (PyUnicode_READY(separator))
10108 goto onError;
10109 sep = separator;
10110 seplen = PyUnicode_GET_LENGTH(separator);
10111 maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
10112 /* inc refcount to keep this code path symmetric with the
10113 above case of a blank separator */
10114 Py_INCREF(sep);
10115 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010116 last_obj = sep;
Tim Peters05eba1f2004-08-27 21:32:02 +000010117 }
10118
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010119 /* There are at least two things to join, or else we have a subclass
10120 * of str in the sequence.
10121 * Do a pre-pass to figure out the total amount of space we'll
10122 * need (sz), and see whether all argument are strings.
10123 */
10124 sz = 0;
Victor Stinnerdd077322011-10-07 17:02:31 +020010125#ifdef Py_DEBUG
10126 use_memcpy = 0;
10127#else
10128 use_memcpy = 1;
10129#endif
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010130 for (i = 0; i < seqlen; i++) {
Xiang Zhangb0541f42017-01-10 10:52:00 +080010131 size_t add_sz;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010132 item = items[i];
Benjamin Peterson29060642009-01-31 22:14:21 +000010133 if (!PyUnicode_Check(item)) {
10134 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020010135 "sequence item %zd: expected str instance,"
10136 " %.80s found",
10137 i, Py_TYPE(item)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000010138 goto onError;
10139 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010140 if (PyUnicode_READY(item) == -1)
10141 goto onError;
Xiang Zhangb0541f42017-01-10 10:52:00 +080010142 add_sz = PyUnicode_GET_LENGTH(item);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010143 item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010144 maxchar = Py_MAX(maxchar, item_maxchar);
Xiang Zhangb0541f42017-01-10 10:52:00 +080010145 if (i != 0) {
10146 add_sz += seplen;
10147 }
10148 if (add_sz > (size_t)(PY_SSIZE_T_MAX - sz)) {
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010149 PyErr_SetString(PyExc_OverflowError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010150 "join() result is too long for a Python string");
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010151 goto onError;
10152 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010153 sz += add_sz;
Victor Stinnerdd077322011-10-07 17:02:31 +020010154 if (use_memcpy && last_obj != NULL) {
10155 if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
10156 use_memcpy = 0;
10157 }
10158 last_obj = item;
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010159 }
Tim Petersced69f82003-09-16 20:30:58 +000010160
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010161 res = PyUnicode_New(sz, maxchar);
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010162 if (res == NULL)
10163 goto onError;
Tim Peters91879ab2004-08-27 22:35:44 +000010164
Antoine Pitrouaf14b792008-08-07 21:50:41 +000010165 /* Catenate everything. */
Victor Stinnerdd077322011-10-07 17:02:31 +020010166#ifdef Py_DEBUG
10167 use_memcpy = 0;
10168#else
10169 if (use_memcpy) {
10170 res_data = PyUnicode_1BYTE_DATA(res);
10171 kind = PyUnicode_KIND(res);
10172 if (seplen != 0)
10173 sep_data = PyUnicode_1BYTE_DATA(sep);
10174 }
10175#endif
Victor Stinner4560f9c2013-04-14 18:56:46 +020010176 if (use_memcpy) {
10177 for (i = 0; i < seqlen; ++i) {
10178 Py_ssize_t itemlen;
10179 item = items[i];
10180
10181 /* Copy item, and maybe the separator. */
10182 if (i && seplen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010183 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010184 sep_data,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010185 kind * seplen);
10186 res_data += kind * seplen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010187 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010188
10189 itemlen = PyUnicode_GET_LENGTH(item);
10190 if (itemlen != 0) {
Christian Heimesf051e432016-09-13 20:22:02 +020010191 memcpy(res_data,
Victor Stinnerdd077322011-10-07 17:02:31 +020010192 PyUnicode_DATA(item),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010193 kind * itemlen);
10194 res_data += kind * itemlen;
Victor Stinnerdd077322011-10-07 17:02:31 +020010195 }
Victor Stinner4560f9c2013-04-14 18:56:46 +020010196 }
10197 assert(res_data == PyUnicode_1BYTE_DATA(res)
10198 + kind * PyUnicode_GET_LENGTH(res));
10199 }
10200 else {
10201 for (i = 0, res_offset = 0; i < seqlen; ++i) {
10202 Py_ssize_t itemlen;
10203 item = items[i];
10204
10205 /* Copy item, and maybe the separator. */
10206 if (i && seplen != 0) {
10207 _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
10208 res_offset += seplen;
10209 }
10210
10211 itemlen = PyUnicode_GET_LENGTH(item);
10212 if (itemlen != 0) {
Victor Stinnerd3f08822012-05-29 12:57:52 +020010213 _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
Victor Stinnerdd077322011-10-07 17:02:31 +020010214 res_offset += itemlen;
10215 }
Victor Stinner9ce5a832011-10-03 23:36:02 +020010216 }
Victor Stinnerdd077322011-10-07 17:02:31 +020010217 assert(res_offset == PyUnicode_GET_LENGTH(res));
Victor Stinner4560f9c2013-04-14 18:56:46 +020010218 }
Tim Peters8ce9f162004-08-27 01:49:32 +000010219
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010220 Py_XDECREF(sep);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010221 assert(_PyUnicode_CheckConsistency(res, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010222 return res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010223
Benjamin Peterson29060642009-01-31 22:14:21 +000010224 onError:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010225 Py_XDECREF(sep);
Tim Peters8ce9f162004-08-27 01:49:32 +000010226 Py_XDECREF(res);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010227 return NULL;
10228}
10229
Victor Stinnerd3f08822012-05-29 12:57:52 +020010230void
10231_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10232 Py_UCS4 fill_char)
10233{
10234 const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
Victor Stinner163403a2018-11-27 12:41:17 +010010235 void *data = PyUnicode_DATA(unicode);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010236 assert(PyUnicode_IS_READY(unicode));
10237 assert(unicode_modifiable(unicode));
10238 assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
10239 assert(start >= 0);
10240 assert(start + length <= PyUnicode_GET_LENGTH(unicode));
Victor Stinner59423e32018-11-26 13:40:01 +010010241 unicode_fill(kind, data, fill_char, start, length);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010242}
10243
Victor Stinner3fe55312012-01-04 00:33:50 +010010244Py_ssize_t
10245PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
10246 Py_UCS4 fill_char)
10247{
10248 Py_ssize_t maxlen;
Victor Stinner3fe55312012-01-04 00:33:50 +010010249
10250 if (!PyUnicode_Check(unicode)) {
10251 PyErr_BadInternalCall();
10252 return -1;
10253 }
10254 if (PyUnicode_READY(unicode) == -1)
10255 return -1;
10256 if (unicode_check_modifiable(unicode))
10257 return -1;
10258
Victor Stinnerd3f08822012-05-29 12:57:52 +020010259 if (start < 0) {
10260 PyErr_SetString(PyExc_IndexError, "string index out of range");
10261 return -1;
10262 }
Victor Stinner3fe55312012-01-04 00:33:50 +010010263 if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
10264 PyErr_SetString(PyExc_ValueError,
10265 "fill character is bigger than "
10266 "the string maximum character");
10267 return -1;
10268 }
10269
10270 maxlen = PyUnicode_GET_LENGTH(unicode) - start;
10271 length = Py_MIN(maxlen, length);
10272 if (length <= 0)
10273 return 0;
10274
Victor Stinnerd3f08822012-05-29 12:57:52 +020010275 _PyUnicode_FastFill(unicode, start, length, fill_char);
Victor Stinner3fe55312012-01-04 00:33:50 +010010276 return length;
10277}
10278
Victor Stinner9310abb2011-10-05 00:59:23 +020010279static PyObject *
10280pad(PyObject *self,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010281 Py_ssize_t left,
10282 Py_ssize_t right,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010283 Py_UCS4 fill)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010284{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010285 PyObject *u;
10286 Py_UCS4 maxchar;
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010287 int kind;
10288 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010289
10290 if (left < 0)
10291 left = 0;
10292 if (right < 0)
10293 right = 0;
10294
Victor Stinnerc4b49542011-12-11 22:44:26 +010010295 if (left == 0 && right == 0)
10296 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010297
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010298 if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
10299 right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
Neal Norwitz3ce5d922008-08-24 07:08:55 +000010300 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
10301 return NULL;
10302 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010303 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010304 maxchar = Py_MAX(maxchar, fill);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010305 u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010306 if (!u)
10307 return NULL;
10308
10309 kind = PyUnicode_KIND(u);
10310 data = PyUnicode_DATA(u);
10311 if (left)
Victor Stinner59423e32018-11-26 13:40:01 +010010312 unicode_fill(kind, data, fill, 0, left);
Victor Stinner6c7a52a2011-09-28 21:39:17 +020010313 if (right)
Victor Stinner59423e32018-11-26 13:40:01 +010010314 unicode_fill(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
Victor Stinnerd3f08822012-05-29 12:57:52 +020010315 _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010316 assert(_PyUnicode_CheckConsistency(u, 1));
10317 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010318}
10319
Alexander Belopolsky40018472011-02-26 01:02:56 +000010320PyObject *
10321PyUnicode_Splitlines(PyObject *string, int keepends)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010322{
Guido van Rossumd57fd912000-03-10 22:53:23 +000010323 PyObject *list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010324
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010325 if (ensure_unicode(string) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010326 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010327
Benjamin Petersonead6b532011-12-20 17:23:42 -060010328 switch (PyUnicode_KIND(string)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010329 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010330 if (PyUnicode_IS_ASCII(string))
10331 list = asciilib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010332 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010333 PyUnicode_GET_LENGTH(string), keepends);
10334 else
10335 list = ucs1lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010336 string, PyUnicode_1BYTE_DATA(string),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010337 PyUnicode_GET_LENGTH(string), keepends);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010338 break;
10339 case PyUnicode_2BYTE_KIND:
10340 list = ucs2lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010341 string, PyUnicode_2BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010342 PyUnicode_GET_LENGTH(string), keepends);
10343 break;
10344 case PyUnicode_4BYTE_KIND:
10345 list = ucs4lib_splitlines(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010346 string, PyUnicode_4BYTE_DATA(string),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010347 PyUnicode_GET_LENGTH(string), keepends);
10348 break;
10349 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010350 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010351 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010352 return list;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010353}
10354
Alexander Belopolsky40018472011-02-26 01:02:56 +000010355static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010356split(PyObject *self,
10357 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010358 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010359{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010360 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010361 void *buf1, *buf2;
10362 Py_ssize_t len1, len2;
10363 PyObject* out;
10364
Guido van Rossumd57fd912000-03-10 22:53:23 +000010365 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010366 maxcount = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010367
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010368 if (PyUnicode_READY(self) == -1)
10369 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010370
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010371 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010372 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010373 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010374 if (PyUnicode_IS_ASCII(self))
10375 return asciilib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010376 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010377 PyUnicode_GET_LENGTH(self), maxcount
10378 );
10379 else
10380 return ucs1lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010381 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010382 PyUnicode_GET_LENGTH(self), maxcount
10383 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010384 case PyUnicode_2BYTE_KIND:
10385 return ucs2lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010386 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010387 PyUnicode_GET_LENGTH(self), maxcount
10388 );
10389 case PyUnicode_4BYTE_KIND:
10390 return ucs4lib_split_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010391 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010392 PyUnicode_GET_LENGTH(self), maxcount
10393 );
10394 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010395 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010396 }
10397
10398 if (PyUnicode_READY(substring) == -1)
10399 return NULL;
10400
10401 kind1 = PyUnicode_KIND(self);
10402 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010403 len1 = PyUnicode_GET_LENGTH(self);
10404 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010405 if (kind1 < kind2 || len1 < len2) {
10406 out = PyList_New(1);
10407 if (out == NULL)
10408 return NULL;
10409 Py_INCREF(self);
10410 PyList_SET_ITEM(out, 0, self);
10411 return out;
10412 }
10413 buf1 = PyUnicode_DATA(self);
10414 buf2 = PyUnicode_DATA(substring);
10415 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010416 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010417 if (!buf2)
10418 return NULL;
10419 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010420
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010421 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010422 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010423 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10424 out = asciilib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010425 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010426 else
10427 out = ucs1lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010428 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010429 break;
10430 case PyUnicode_2BYTE_KIND:
10431 out = ucs2lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010432 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010433 break;
10434 case PyUnicode_4BYTE_KIND:
10435 out = ucs4lib_split(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010436 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010437 break;
10438 default:
10439 out = NULL;
10440 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010441 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010442 PyMem_Free(buf2);
10443 return out;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010444}
10445
Alexander Belopolsky40018472011-02-26 01:02:56 +000010446static PyObject *
Victor Stinner9310abb2011-10-05 00:59:23 +020010447rsplit(PyObject *self,
10448 PyObject *substring,
Alexander Belopolsky40018472011-02-26 01:02:56 +000010449 Py_ssize_t maxcount)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010450{
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010451 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010452 void *buf1, *buf2;
10453 Py_ssize_t len1, len2;
10454 PyObject* out;
10455
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010456 if (maxcount < 0)
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000010457 maxcount = PY_SSIZE_T_MAX;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010458
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010459 if (PyUnicode_READY(self) == -1)
10460 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010461
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010462 if (substring == NULL)
Benjamin Petersonead6b532011-12-20 17:23:42 -060010463 switch (PyUnicode_KIND(self)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010464 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010465 if (PyUnicode_IS_ASCII(self))
10466 return asciilib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010467 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010468 PyUnicode_GET_LENGTH(self), maxcount
10469 );
10470 else
10471 return ucs1lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010472 self, PyUnicode_1BYTE_DATA(self),
Victor Stinnerc3cec782011-10-05 21:24:08 +020010473 PyUnicode_GET_LENGTH(self), maxcount
10474 );
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010475 case PyUnicode_2BYTE_KIND:
10476 return ucs2lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010477 self, PyUnicode_2BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010478 PyUnicode_GET_LENGTH(self), maxcount
10479 );
10480 case PyUnicode_4BYTE_KIND:
10481 return ucs4lib_rsplit_whitespace(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010482 self, PyUnicode_4BYTE_DATA(self),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010483 PyUnicode_GET_LENGTH(self), maxcount
10484 );
10485 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070010486 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010487 }
10488
10489 if (PyUnicode_READY(substring) == -1)
10490 return NULL;
10491
10492 kind1 = PyUnicode_KIND(self);
10493 kind2 = PyUnicode_KIND(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010494 len1 = PyUnicode_GET_LENGTH(self);
10495 len2 = PyUnicode_GET_LENGTH(substring);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010496 if (kind1 < kind2 || len1 < len2) {
10497 out = PyList_New(1);
10498 if (out == NULL)
10499 return NULL;
10500 Py_INCREF(self);
10501 PyList_SET_ITEM(out, 0, self);
10502 return out;
10503 }
10504 buf1 = PyUnicode_DATA(self);
10505 buf2 = PyUnicode_DATA(substring);
10506 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010507 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010508 if (!buf2)
10509 return NULL;
10510 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010511
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010512 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010513 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010514 if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10515 out = asciilib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010516 self, buf1, len1, buf2, len2, maxcount);
Victor Stinnerc3cec782011-10-05 21:24:08 +020010517 else
10518 out = ucs1lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010519 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010520 break;
10521 case PyUnicode_2BYTE_KIND:
10522 out = ucs2lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010523 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010524 break;
10525 case PyUnicode_4BYTE_KIND:
10526 out = ucs4lib_rsplit(
Victor Stinner7931d9a2011-11-04 00:22:48 +010010527 self, buf1, len1, buf2, len2, maxcount);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010528 break;
10529 default:
10530 out = NULL;
10531 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010532 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010533 PyMem_Free(buf2);
10534 return out;
10535}
10536
10537static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010538anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10539 PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010540{
Benjamin Petersonead6b532011-12-20 17:23:42 -060010541 switch (kind) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010542 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020010543 if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10544 return asciilib_find(buf1, len1, buf2, len2, offset);
10545 else
10546 return ucs1lib_find(buf1, len1, buf2, len2, offset);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010547 case PyUnicode_2BYTE_KIND:
10548 return ucs2lib_find(buf1, len1, buf2, len2, offset);
10549 case PyUnicode_4BYTE_KIND:
10550 return ucs4lib_find(buf1, len1, buf2, len2, offset);
10551 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010552 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010553}
10554
10555static Py_ssize_t
Victor Stinnerc3cec782011-10-05 21:24:08 +020010556anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10557 PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010558{
Benjamin Petersonc0b95d12011-12-20 17:24:05 -060010559 switch (kind) {
10560 case PyUnicode_1BYTE_KIND:
10561 if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10562 return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10563 else
10564 return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10565 case PyUnicode_2BYTE_KIND:
10566 return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10567 case PyUnicode_4BYTE_KIND:
10568 return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10569 }
Barry Warsawb2e57942017-09-14 18:13:16 -070010570 Py_UNREACHABLE();
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000010571}
10572
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010573static void
10574replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10575 Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10576{
10577 int kind = PyUnicode_KIND(u);
10578 void *data = PyUnicode_DATA(u);
10579 Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10580 if (kind == PyUnicode_1BYTE_KIND) {
10581 ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10582 (Py_UCS1 *)data + len,
10583 u1, u2, maxcount);
10584 }
10585 else if (kind == PyUnicode_2BYTE_KIND) {
10586 ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10587 (Py_UCS2 *)data + len,
10588 u1, u2, maxcount);
10589 }
10590 else {
10591 assert(kind == PyUnicode_4BYTE_KIND);
10592 ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10593 (Py_UCS4 *)data + len,
10594 u1, u2, maxcount);
10595 }
10596}
10597
Alexander Belopolsky40018472011-02-26 01:02:56 +000010598static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010599replace(PyObject *self, PyObject *str1,
10600 PyObject *str2, Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010601{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010602 PyObject *u;
10603 char *sbuf = PyUnicode_DATA(self);
10604 char *buf1 = PyUnicode_DATA(str1);
10605 char *buf2 = PyUnicode_DATA(str2);
10606 int srelease = 0, release1 = 0, release2 = 0;
10607 int skind = PyUnicode_KIND(self);
10608 int kind1 = PyUnicode_KIND(str1);
10609 int kind2 = PyUnicode_KIND(str2);
10610 Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10611 Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10612 Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010613 int mayshrink;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010614 Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010615
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010616 if (slen < len1)
10617 goto nothing;
10618
Guido van Rossumd57fd912000-03-10 22:53:23 +000010619 if (maxcount < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000010620 maxcount = PY_SSIZE_T_MAX;
Serhiy Storchaka865c3b22019-10-30 12:03:53 +020010621 else if (maxcount == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010622 goto nothing;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010623
Victor Stinner59de0ee2011-10-07 10:01:28 +020010624 if (str1 == str2)
10625 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010626
Victor Stinner49a0a212011-10-12 23:46:10 +020010627 maxchar = PyUnicode_MAX_CHAR_VALUE(self);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010628 maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10629 if (maxchar < maxchar_str1)
10630 /* substring too wide to be present */
10631 goto nothing;
Victor Stinner49a0a212011-10-12 23:46:10 +020010632 maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10633 /* Replacing str1 with str2 may cause a maxchar reduction in the
10634 result string. */
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010635 mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
Benjamin Peterson7e303732013-06-10 09:19:46 -070010636 maxchar = Py_MAX(maxchar, maxchar_str2);
Victor Stinner49a0a212011-10-12 23:46:10 +020010637
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010638 if (len1 == len2) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010639 /* same length */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010640 if (len1 == 0)
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010641 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010642 if (len1 == 1) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010643 /* replace characters */
Victor Stinner49a0a212011-10-12 23:46:10 +020010644 Py_UCS4 u1, u2;
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010645 Py_ssize_t pos;
Victor Stinnerf6441102011-12-18 02:43:08 +010010646
Victor Stinner69ed0f42013-04-09 21:48:24 +020010647 u1 = PyUnicode_READ(kind1, buf1, 0);
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010648 pos = findchar(sbuf, skind, slen, u1, 1);
Victor Stinnerf6441102011-12-18 02:43:08 +010010649 if (pos < 0)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010650 goto nothing;
Victor Stinner69ed0f42013-04-09 21:48:24 +020010651 u2 = PyUnicode_READ(kind2, buf2, 0);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010652 u = PyUnicode_New(slen, maxchar);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010653 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010654 goto error;
Victor Stinnerf6441102011-12-18 02:43:08 +010010655
Serhiy Storchakae2cef882013-04-13 22:45:04 +030010656 _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10657 replace_1char_inplace(u, pos, u1, u2, maxcount);
Victor Stinner49a0a212011-10-12 23:46:10 +020010658 }
10659 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010660 int rkind = skind;
10661 char *res;
Victor Stinnerf6441102011-12-18 02:43:08 +010010662 Py_ssize_t i;
Victor Stinner25a4b292011-10-06 12:31:55 +020010663
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010664 if (kind1 < rkind) {
10665 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010666 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010667 if (!buf1) goto error;
10668 release1 = 1;
10669 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010670 i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010671 if (i < 0)
10672 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010673 if (rkind > kind2) {
10674 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010675 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010676 if (!buf2) goto error;
10677 release2 = 1;
10678 }
10679 else if (rkind < kind2) {
10680 /* widen self and buf1 */
10681 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010682 if (release1) {
10683 PyMem_Free(buf1);
10684 buf1 = PyUnicode_DATA(str1);
10685 release1 = 0;
10686 }
10687 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010688 if (!sbuf) goto error;
10689 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010690 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010691 if (!buf1) goto error;
10692 release1 = 1;
10693 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010694 u = PyUnicode_New(slen, maxchar);
10695 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010696 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010697 assert(PyUnicode_KIND(u) == rkind);
10698 res = PyUnicode_DATA(u);
Victor Stinner25a4b292011-10-06 12:31:55 +020010699
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010700 memcpy(res, sbuf, rkind * slen);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010701 /* change everything in-place, starting with this one */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010702 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010703 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010704 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010705 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010706
10707 while ( --maxcount > 0) {
Victor Stinnerc3cec782011-10-05 21:24:08 +020010708 i = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010709 sbuf+rkind*i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010710 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010711 if (i == -1)
10712 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010713 memcpy(res + rkind * i,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010714 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010715 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010716 i += len1;
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010717 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000010718 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010719 }
10720 else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010721 Py_ssize_t n, i, j, ires;
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010722 Py_ssize_t new_size;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010723 int rkind = skind;
10724 char *res;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010725
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010726 if (kind1 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010727 /* widen substring */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010728 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010729 if (!buf1) goto error;
10730 release1 = 1;
10731 }
Victor Stinnerc3cec782011-10-05 21:24:08 +020010732 n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
Thomas Wouters477c8d52006-05-27 19:21:47 +000010733 if (n == 0)
10734 goto nothing;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010735 if (kind2 < rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010736 /* widen replacement */
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010737 buf2 = unicode_askind(kind2, buf2, len2, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010738 if (!buf2) goto error;
10739 release2 = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010740 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010741 else if (kind2 > rkind) {
Victor Stinner49a0a212011-10-12 23:46:10 +020010742 /* widen self and buf1 */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010743 rkind = kind2;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010744 sbuf = unicode_askind(skind, sbuf, slen, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010745 if (!sbuf) goto error;
10746 srelease = 1;
Serhiy Storchaka17b47332020-04-01 15:41:49 +030010747 if (release1) {
10748 PyMem_Free(buf1);
10749 buf1 = PyUnicode_DATA(str1);
10750 release1 = 0;
10751 }
10752 buf1 = unicode_askind(kind1, buf1, len1, rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010753 if (!buf1) goto error;
10754 release1 = 1;
10755 }
10756 /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10757 PyUnicode_GET_LENGTH(str1))); */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020010758 if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010759 PyErr_SetString(PyExc_OverflowError,
10760 "replace string is too long");
10761 goto error;
10762 }
Mark Dickinsonc04ddff2012-10-06 18:04:49 +010010763 new_size = slen + n * (len2 - len1);
Victor Stinner49a0a212011-10-12 23:46:10 +020010764 if (new_size == 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020010765 _Py_INCREF_UNICODE_EMPTY();
10766 if (!unicode_empty)
10767 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010768 u = unicode_empty;
10769 goto done;
10770 }
Xiang Zhangb0541f42017-01-10 10:52:00 +080010771 if (new_size > (PY_SSIZE_T_MAX / rkind)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010772 PyErr_SetString(PyExc_OverflowError,
10773 "replace string is too long");
10774 goto error;
10775 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010776 u = PyUnicode_New(new_size, maxchar);
10777 if (!u)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010778 goto error;
Victor Stinner49a0a212011-10-12 23:46:10 +020010779 assert(PyUnicode_KIND(u) == rkind);
10780 res = PyUnicode_DATA(u);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010781 ires = i = 0;
10782 if (len1 > 0) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010783 while (n-- > 0) {
10784 /* look for next match */
Victor Stinnerc3cec782011-10-05 21:24:08 +020010785 j = anylib_find(rkind, self,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010786 sbuf + rkind * i, slen-i,
Victor Stinnerc3cec782011-10-05 21:24:08 +020010787 str1, buf1, len1, i);
Antoine Pitrouf2c54842010-01-13 08:07:53 +000010788 if (j == -1)
10789 break;
10790 else if (j > i) {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010791 /* copy unchanged part [i:j] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010792 memcpy(res + rkind * ires,
10793 sbuf + rkind * i,
10794 rkind * (j-i));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010795 ires += j - i;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010796 }
10797 /* copy substitution string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010798 if (len2 > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010799 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010800 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010801 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010802 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010803 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010804 i = j + len1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010805 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010806 if (i < slen)
Thomas Wouters477c8d52006-05-27 19:21:47 +000010807 /* copy tail [i:] */
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010808 memcpy(res + rkind * ires,
10809 sbuf + rkind * i,
10810 rkind * (slen-i));
Victor Stinner49a0a212011-10-12 23:46:10 +020010811 }
10812 else {
Thomas Wouters477c8d52006-05-27 19:21:47 +000010813 /* interleave */
10814 while (n > 0) {
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010815 memcpy(res + rkind * ires,
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010816 buf2,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010817 rkind * len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010818 ires += len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010819 if (--n <= 0)
10820 break;
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010821 memcpy(res + rkind * ires,
10822 sbuf + rkind * i,
10823 rkind);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010824 ires++;
10825 i++;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010826 }
Martin v. Löwisc47adb02011-10-07 20:55:35 +020010827 memcpy(res + rkind * ires,
10828 sbuf + rkind * i,
10829 rkind * (slen-i));
Thomas Wouters477c8d52006-05-27 19:21:47 +000010830 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010831 }
10832
10833 if (mayshrink) {
Victor Stinner25a4b292011-10-06 12:31:55 +020010834 unicode_adjust_maxchar(&u);
10835 if (u == NULL)
10836 goto error;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010837 }
Victor Stinner49a0a212011-10-12 23:46:10 +020010838
10839 done:
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010840 if (srelease)
10841 PyMem_FREE(sbuf);
10842 if (release1)
10843 PyMem_FREE(buf1);
10844 if (release2)
10845 PyMem_FREE(buf2);
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020010846 assert(_PyUnicode_CheckConsistency(u, 1));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010847 return u;
Thomas Wouters477c8d52006-05-27 19:21:47 +000010848
Benjamin Peterson29060642009-01-31 22:14:21 +000010849 nothing:
Thomas Wouters477c8d52006-05-27 19:21:47 +000010850 /* nothing to replace; return original string (when possible) */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010851 if (srelease)
10852 PyMem_FREE(sbuf);
10853 if (release1)
10854 PyMem_FREE(buf1);
10855 if (release2)
10856 PyMem_FREE(buf2);
Victor Stinnerc4b49542011-12-11 22:44:26 +010010857 return unicode_result_unchanged(self);
10858
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010859 error:
10860 if (srelease && sbuf)
10861 PyMem_FREE(sbuf);
10862 if (release1 && buf1)
10863 PyMem_FREE(buf1);
10864 if (release2 && buf2)
10865 PyMem_FREE(buf2);
10866 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010867}
10868
10869/* --- Unicode Object Methods --------------------------------------------- */
10870
INADA Naoki3ae20562017-01-16 20:41:20 +090010871/*[clinic input]
10872str.title as unicode_title
Guido van Rossumd57fd912000-03-10 22:53:23 +000010873
INADA Naoki3ae20562017-01-16 20:41:20 +090010874Return a version of the string where each word is titlecased.
10875
10876More specifically, words start with uppercased characters and all remaining
10877cased characters have lower case.
10878[clinic start generated code]*/
10879
10880static PyObject *
10881unicode_title_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010882/*[clinic end generated code: output=c75ae03809574902 input=fa945d669b26e683]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010883{
Benjamin Petersoneea48462012-01-16 14:28:50 -050010884 if (PyUnicode_READY(self) == -1)
10885 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010886 return case_operation(self, do_title);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010887}
10888
INADA Naoki3ae20562017-01-16 20:41:20 +090010889/*[clinic input]
10890str.capitalize as unicode_capitalize
Guido van Rossumd57fd912000-03-10 22:53:23 +000010891
INADA Naoki3ae20562017-01-16 20:41:20 +090010892Return a capitalized version of the string.
10893
10894More specifically, make the first character have upper case and the rest lower
10895case.
10896[clinic start generated code]*/
10897
10898static PyObject *
10899unicode_capitalize_impl(PyObject *self)
10900/*[clinic end generated code: output=e49a4c333cdb7667 input=f4cbf1016938da6d]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010901{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050010902 if (PyUnicode_READY(self) == -1)
10903 return NULL;
10904 if (PyUnicode_GET_LENGTH(self) == 0)
10905 return unicode_result_unchanged(self);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010906 return case_operation(self, do_capitalize);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010907}
10908
INADA Naoki3ae20562017-01-16 20:41:20 +090010909/*[clinic input]
10910str.casefold as unicode_casefold
10911
10912Return a version of the string suitable for caseless comparisons.
10913[clinic start generated code]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010914
10915static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010916unicode_casefold_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090010917/*[clinic end generated code: output=0120daf657ca40af input=384d66cc2ae30daf]*/
Benjamin Petersond5890c82012-01-14 13:23:30 -050010918{
10919 if (PyUnicode_READY(self) == -1)
10920 return NULL;
10921 if (PyUnicode_IS_ASCII(self))
10922 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010010923 return case_operation(self, do_casefold);
Benjamin Petersond5890c82012-01-14 13:23:30 -050010924}
10925
10926
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010927/* Argument converter. Accepts a single Unicode character. */
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010928
10929static int
10930convert_uc(PyObject *obj, void *addr)
10931{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010932 Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010933
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010934 if (!PyUnicode_Check(obj)) {
10935 PyErr_Format(PyExc_TypeError,
10936 "The fill character must be a unicode character, "
Victor Stinner998b8062018-09-12 00:23:25 +020010937 "not %.100s", Py_TYPE(obj)->tp_name);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010938 return 0;
10939 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010940 if (PyUnicode_READY(obj) < 0)
10941 return 0;
10942 if (PyUnicode_GET_LENGTH(obj) != 1) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000010943 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000010944 "The fill character must be exactly one character long");
Benjamin Peterson14339b62009-01-31 16:36:08 +000010945 return 0;
10946 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030010947 *fillcharloc = PyUnicode_READ_CHAR(obj, 0);
Benjamin Peterson14339b62009-01-31 16:36:08 +000010948 return 1;
Raymond Hettinger4f8f9762003-11-26 08:21:35 +000010949}
10950
INADA Naoki3ae20562017-01-16 20:41:20 +090010951/*[clinic input]
10952str.center as unicode_center
10953
10954 width: Py_ssize_t
10955 fillchar: Py_UCS4 = ' '
10956 /
10957
10958Return a centered string of length width.
10959
10960Padding is done using the specified fill character (default is a space).
10961[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010962
10963static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090010964unicode_center_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
10965/*[clinic end generated code: output=420c8859effc7c0c input=b42b247eb26e6519]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000010966{
Martin v. Löwis18e16552006-02-15 17:27:45 +000010967 Py_ssize_t marg, left;
Guido van Rossumd57fd912000-03-10 22:53:23 +000010968
Benjamin Petersonbac79492012-01-14 13:34:47 -050010969 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000010970 return NULL;
10971
Victor Stinnerc4b49542011-12-11 22:44:26 +010010972 if (PyUnicode_GET_LENGTH(self) >= width)
10973 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010974
Victor Stinnerc4b49542011-12-11 22:44:26 +010010975 marg = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010976 left = marg / 2 + (marg & width & 1);
10977
Victor Stinner9310abb2011-10-05 00:59:23 +020010978 return pad(self, left, marg - left, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000010979}
10980
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020010981/* This function assumes that str1 and str2 are readied by the caller. */
10982
Marc-André Lemburge5034372000-08-08 08:04:29 +000010983static int
Victor Stinner9db1a8b2011-10-23 20:04:37 +020010984unicode_compare(PyObject *str1, PyObject *str2)
Marc-André Lemburge5034372000-08-08 08:04:29 +000010985{
Victor Stinnerc1302bb2013-04-08 21:50:54 +020010986#define COMPARE(TYPE1, TYPE2) \
10987 do { \
10988 TYPE1* p1 = (TYPE1 *)data1; \
10989 TYPE2* p2 = (TYPE2 *)data2; \
10990 TYPE1* end = p1 + len; \
10991 Py_UCS4 c1, c2; \
10992 for (; p1 != end; p1++, p2++) { \
10993 c1 = *p1; \
10994 c2 = *p2; \
10995 if (c1 != c2) \
10996 return (c1 < c2) ? -1 : 1; \
10997 } \
10998 } \
10999 while (0)
11000
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011001 int kind1, kind2;
11002 void *data1, *data2;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011003 Py_ssize_t len1, len2, len;
Marc-André Lemburge5034372000-08-08 08:04:29 +000011004
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011005 kind1 = PyUnicode_KIND(str1);
11006 kind2 = PyUnicode_KIND(str2);
11007 data1 = PyUnicode_DATA(str1);
11008 data2 = PyUnicode_DATA(str2);
11009 len1 = PyUnicode_GET_LENGTH(str1);
11010 len2 = PyUnicode_GET_LENGTH(str2);
Victor Stinner770e19e2012-10-04 22:59:45 +020011011 len = Py_MIN(len1, len2);
Marc-André Lemburge5034372000-08-08 08:04:29 +000011012
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011013 switch(kind1) {
11014 case PyUnicode_1BYTE_KIND:
11015 {
11016 switch(kind2) {
11017 case PyUnicode_1BYTE_KIND:
11018 {
11019 int cmp = memcmp(data1, data2, len);
11020 /* normalize result of memcmp() into the range [-1; 1] */
11021 if (cmp < 0)
11022 return -1;
11023 if (cmp > 0)
11024 return 1;
11025 break;
Victor Stinner770e19e2012-10-04 22:59:45 +020011026 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011027 case PyUnicode_2BYTE_KIND:
11028 COMPARE(Py_UCS1, Py_UCS2);
11029 break;
11030 case PyUnicode_4BYTE_KIND:
11031 COMPARE(Py_UCS1, Py_UCS4);
11032 break;
11033 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011034 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011035 }
11036 break;
11037 }
11038 case PyUnicode_2BYTE_KIND:
11039 {
11040 switch(kind2) {
11041 case PyUnicode_1BYTE_KIND:
11042 COMPARE(Py_UCS2, Py_UCS1);
11043 break;
11044 case PyUnicode_2BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011045 {
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011046 COMPARE(Py_UCS2, Py_UCS2);
11047 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011048 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011049 case PyUnicode_4BYTE_KIND:
11050 COMPARE(Py_UCS2, Py_UCS4);
11051 break;
11052 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011053 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011054 }
11055 break;
11056 }
11057 case PyUnicode_4BYTE_KIND:
11058 {
11059 switch(kind2) {
11060 case PyUnicode_1BYTE_KIND:
11061 COMPARE(Py_UCS4, Py_UCS1);
11062 break;
11063 case PyUnicode_2BYTE_KIND:
11064 COMPARE(Py_UCS4, Py_UCS2);
11065 break;
11066 case PyUnicode_4BYTE_KIND:
Victor Stinnercd777ea2013-04-08 22:43:44 +020011067 {
11068#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
11069 int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
11070 /* normalize result of wmemcmp() into the range [-1; 1] */
11071 if (cmp < 0)
11072 return -1;
11073 if (cmp > 0)
11074 return 1;
11075#else
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011076 COMPARE(Py_UCS4, Py_UCS4);
Victor Stinnercd777ea2013-04-08 22:43:44 +020011077#endif
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011078 break;
Victor Stinnercd777ea2013-04-08 22:43:44 +020011079 }
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011080 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011081 Py_UNREACHABLE();
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011082 }
11083 break;
11084 }
11085 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011086 Py_UNREACHABLE();
Marc-André Lemburge5034372000-08-08 08:04:29 +000011087 }
11088
Victor Stinner770e19e2012-10-04 22:59:45 +020011089 if (len1 == len2)
11090 return 0;
11091 if (len1 < len2)
11092 return -1;
11093 else
11094 return 1;
Victor Stinnerc1302bb2013-04-08 21:50:54 +020011095
11096#undef COMPARE
Marc-André Lemburge5034372000-08-08 08:04:29 +000011097}
11098
Benjamin Peterson621b4302016-09-09 13:54:34 -070011099static int
Victor Stinnere5567ad2012-10-23 02:48:49 +020011100unicode_compare_eq(PyObject *str1, PyObject *str2)
11101{
11102 int kind;
11103 void *data1, *data2;
11104 Py_ssize_t len;
11105 int cmp;
11106
Victor Stinnere5567ad2012-10-23 02:48:49 +020011107 len = PyUnicode_GET_LENGTH(str1);
11108 if (PyUnicode_GET_LENGTH(str2) != len)
11109 return 0;
11110 kind = PyUnicode_KIND(str1);
11111 if (PyUnicode_KIND(str2) != kind)
11112 return 0;
11113 data1 = PyUnicode_DATA(str1);
11114 data2 = PyUnicode_DATA(str2);
11115
11116 cmp = memcmp(data1, data2, len * kind);
11117 return (cmp == 0);
11118}
11119
11120
Alexander Belopolsky40018472011-02-26 01:02:56 +000011121int
11122PyUnicode_Compare(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011123{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011124 if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
11125 if (PyUnicode_READY(left) == -1 ||
11126 PyUnicode_READY(right) == -1)
11127 return -1;
Victor Stinnerf0c7b2a2013-11-04 11:27:14 +010011128
11129 /* a string is equal to itself */
11130 if (left == right)
11131 return 0;
11132
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011133 return unicode_compare(left, right);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011134 }
Guido van Rossum09dc34f2007-05-04 04:17:33 +000011135 PyErr_Format(PyExc_TypeError,
11136 "Can't compare %.100s and %.100s",
Victor Stinner58ac7002020-02-07 03:04:21 +010011137 Py_TYPE(left)->tp_name,
11138 Py_TYPE(right)->tp_name);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011139 return -1;
11140}
11141
Martin v. Löwis5b222132007-06-10 09:51:05 +000011142int
11143PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
11144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011145 Py_ssize_t i;
11146 int kind;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011147 Py_UCS4 chr;
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011148 const unsigned char *ustr = (const unsigned char *)str;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011149
Victor Stinner910337b2011-10-03 03:20:16 +020011150 assert(_PyUnicode_CHECK(uni));
Serhiy Storchaka419967b2016-12-06 00:13:34 +020011151 if (!PyUnicode_IS_READY(uni)) {
11152 const wchar_t *ws = _PyUnicode_WSTR(uni);
11153 /* Compare Unicode string and source character set string */
11154 for (i = 0; (chr = ws[i]) && ustr[i]; i++) {
11155 if (chr != ustr[i])
11156 return (chr < ustr[i]) ? -1 : 1;
11157 }
11158 /* This check keeps Python strings that end in '\0' from comparing equal
11159 to C strings identical up to that point. */
11160 if (_PyUnicode_WSTR_LENGTH(uni) != i || chr)
11161 return 1; /* uni is longer */
11162 if (ustr[i])
11163 return -1; /* str is longer */
11164 return 0;
11165 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011166 kind = PyUnicode_KIND(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011167 if (kind == PyUnicode_1BYTE_KIND) {
Victor Stinnera6b9b072013-10-30 18:27:13 +010011168 const void *data = PyUnicode_1BYTE_DATA(uni);
Victor Stinnere1b15922013-11-03 13:53:12 +010011169 size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
Victor Stinner602f7cf2013-10-29 23:31:50 +010011170 size_t len, len2 = strlen(str);
11171 int cmp;
11172
11173 len = Py_MIN(len1, len2);
11174 cmp = memcmp(data, str, len);
Victor Stinner21ea21e2013-11-04 11:28:26 +010011175 if (cmp != 0) {
11176 if (cmp < 0)
11177 return -1;
11178 else
11179 return 1;
11180 }
Victor Stinner602f7cf2013-10-29 23:31:50 +010011181 if (len1 > len2)
11182 return 1; /* uni is longer */
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011183 if (len1 < len2)
Victor Stinner602f7cf2013-10-29 23:31:50 +010011184 return -1; /* str is longer */
11185 return 0;
11186 }
11187 else {
11188 void *data = PyUnicode_DATA(uni);
11189 /* Compare Unicode string and source character set string */
11190 for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
Victor Stinner12174a52014-08-15 23:17:38 +020011191 if (chr != (unsigned char)str[i])
Victor Stinner602f7cf2013-10-29 23:31:50 +010011192 return (chr < (unsigned char)(str[i])) ? -1 : 1;
11193 /* This check keeps Python strings that end in '\0' from comparing equal
11194 to C strings identical up to that point. */
11195 if (PyUnicode_GET_LENGTH(uni) != i || chr)
11196 return 1; /* uni is longer */
11197 if (str[i])
11198 return -1; /* str is longer */
11199 return 0;
11200 }
Martin v. Löwis5b222132007-06-10 09:51:05 +000011201}
11202
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011203static int
11204non_ready_unicode_equal_to_ascii_string(PyObject *unicode, const char *str)
11205{
11206 size_t i, len;
11207 const wchar_t *p;
11208 len = (size_t)_PyUnicode_WSTR_LENGTH(unicode);
11209 if (strlen(str) != len)
11210 return 0;
11211 p = _PyUnicode_WSTR(unicode);
11212 assert(p);
11213 for (i = 0; i < len; i++) {
11214 unsigned char c = (unsigned char)str[i];
Serhiy Storchaka292dd1b2016-11-16 16:12:34 +020011215 if (c >= 128 || p[i] != (wchar_t)c)
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011216 return 0;
11217 }
11218 return 1;
11219}
11220
11221int
11222_PyUnicode_EqualToASCIIString(PyObject *unicode, const char *str)
11223{
11224 size_t len;
11225 assert(_PyUnicode_CHECK(unicode));
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011226 assert(str);
11227#ifndef NDEBUG
11228 for (const char *p = str; *p; p++) {
11229 assert((unsigned char)*p < 128);
11230 }
11231#endif
Serhiy Storchakaf4934ea2016-11-16 10:17:58 +020011232 if (PyUnicode_READY(unicode) == -1) {
11233 /* Memory error or bad data */
11234 PyErr_Clear();
11235 return non_ready_unicode_equal_to_ascii_string(unicode, str);
11236 }
11237 if (!PyUnicode_IS_ASCII(unicode))
11238 return 0;
11239 len = (size_t)PyUnicode_GET_LENGTH(unicode);
11240 return strlen(str) == len &&
11241 memcmp(PyUnicode_1BYTE_DATA(unicode), str, len) == 0;
11242}
11243
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011244int
11245_PyUnicode_EqualToASCIIId(PyObject *left, _Py_Identifier *right)
11246{
11247 PyObject *right_uni;
11248 Py_hash_t hash;
11249
11250 assert(_PyUnicode_CHECK(left));
11251 assert(right->string);
Serhiy Storchakaa83a6a32016-11-16 20:02:44 +020011252#ifndef NDEBUG
11253 for (const char *p = right->string; *p; p++) {
11254 assert((unsigned char)*p < 128);
11255 }
11256#endif
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011257
11258 if (PyUnicode_READY(left) == -1) {
11259 /* memory error or bad data */
11260 PyErr_Clear();
11261 return non_ready_unicode_equal_to_ascii_string(left, right->string);
11262 }
11263
11264 if (!PyUnicode_IS_ASCII(left))
11265 return 0;
11266
11267 right_uni = _PyUnicode_FromId(right); /* borrowed */
11268 if (right_uni == NULL) {
11269 /* memory error or bad data */
11270 PyErr_Clear();
11271 return _PyUnicode_EqualToASCIIString(left, right->string);
11272 }
11273
11274 if (left == right_uni)
11275 return 1;
11276
11277 if (PyUnicode_CHECK_INTERNED(left))
11278 return 0;
11279
INADA Naoki7cc95f52018-01-28 02:07:09 +090011280 assert(_PyUnicode_HASH(right_uni) != -1);
Serhiy Storchakaf5894dd2016-11-16 15:40:39 +020011281 hash = _PyUnicode_HASH(left);
11282 if (hash != -1 && hash != _PyUnicode_HASH(right_uni))
11283 return 0;
11284
11285 return unicode_compare_eq(left, right_uni);
11286}
Antoine Pitrou51f3ef92008-12-20 13:14:23 +000011287
Alexander Belopolsky40018472011-02-26 01:02:56 +000011288PyObject *
11289PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011290{
11291 int result;
Benjamin Peterson14339b62009-01-31 16:36:08 +000011292
Victor Stinnere5567ad2012-10-23 02:48:49 +020011293 if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
11294 Py_RETURN_NOTIMPLEMENTED;
11295
11296 if (PyUnicode_READY(left) == -1 ||
11297 PyUnicode_READY(right) == -1)
11298 return NULL;
11299
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011300 if (left == right) {
11301 switch (op) {
11302 case Py_EQ:
11303 case Py_LE:
11304 case Py_GE:
11305 /* a string is equal to itself */
stratakise8b19652017-11-02 11:32:54 +010011306 Py_RETURN_TRUE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011307 case Py_NE:
11308 case Py_LT:
11309 case Py_GT:
stratakise8b19652017-11-02 11:32:54 +010011310 Py_RETURN_FALSE;
Victor Stinnerfd9e44d2013-11-04 11:23:05 +010011311 default:
11312 PyErr_BadArgument();
11313 return NULL;
11314 }
11315 }
11316 else if (op == Py_EQ || op == Py_NE) {
Victor Stinnere5567ad2012-10-23 02:48:49 +020011317 result = unicode_compare_eq(left, right);
Victor Stinnerc8bc5372013-11-04 11:08:10 +010011318 result ^= (op == Py_NE);
stratakise8b19652017-11-02 11:32:54 +010011319 return PyBool_FromLong(result);
Victor Stinnere5567ad2012-10-23 02:48:49 +020011320 }
11321 else {
Victor Stinner90db9c42012-10-04 21:53:50 +020011322 result = unicode_compare(left, right);
stratakise8b19652017-11-02 11:32:54 +010011323 Py_RETURN_RICHCOMPARE(result, 0, op);
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011324 }
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000011325}
11326
Alexander Belopolsky40018472011-02-26 01:02:56 +000011327int
Raymond Hettingerac2ef652015-07-04 16:04:44 -070011328_PyUnicode_EQ(PyObject *aa, PyObject *bb)
11329{
11330 return unicode_eq(aa, bb);
11331}
11332
11333int
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011334PyUnicode_Contains(PyObject *str, PyObject *substr)
Guido van Rossum403d68b2000-03-13 15:55:09 +000011335{
Victor Stinner77282cb2013-04-14 19:22:47 +020011336 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011337 void *buf1, *buf2;
11338 Py_ssize_t len1, len2;
Martin v. Löwis18e16552006-02-15 17:27:45 +000011339 int result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011340
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011341 if (!PyUnicode_Check(substr)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000011342 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020011343 "'in <string>' requires string as left operand, not %.100s",
11344 Py_TYPE(substr)->tp_name);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011345 return -1;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011346 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011347 if (PyUnicode_READY(substr) == -1)
Thomas Wouters477c8d52006-05-27 19:21:47 +000011348 return -1;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011349 if (ensure_unicode(str) < 0)
11350 return -1;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011351
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011352 kind1 = PyUnicode_KIND(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011353 kind2 = PyUnicode_KIND(substr);
11354 if (kind1 < kind2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011355 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011356 len1 = PyUnicode_GET_LENGTH(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011357 len2 = PyUnicode_GET_LENGTH(substr);
11358 if (len1 < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011359 return 0;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011360 buf1 = PyUnicode_DATA(str);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011361 buf2 = PyUnicode_DATA(substr);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011362 if (len2 == 1) {
11363 Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11364 result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011365 return result;
11366 }
11367 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011368 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011369 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011370 return -1;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011371 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011372
Victor Stinner77282cb2013-04-14 19:22:47 +020011373 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011374 case PyUnicode_1BYTE_KIND:
11375 result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11376 break;
11377 case PyUnicode_2BYTE_KIND:
11378 result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11379 break;
11380 case PyUnicode_4BYTE_KIND:
11381 result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11382 break;
11383 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011384 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011385 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011386
Victor Stinner77282cb2013-04-14 19:22:47 +020011387 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011388 PyMem_Free(buf2);
11389
Guido van Rossum403d68b2000-03-13 15:55:09 +000011390 return result;
Guido van Rossum403d68b2000-03-13 15:55:09 +000011391}
11392
Guido van Rossumd57fd912000-03-10 22:53:23 +000011393/* Concat to string or Unicode object giving a new Unicode object. */
11394
Alexander Belopolsky40018472011-02-26 01:02:56 +000011395PyObject *
11396PyUnicode_Concat(PyObject *left, PyObject *right)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011397{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011398 PyObject *result;
Victor Stinner127226b2011-10-13 01:12:34 +020011399 Py_UCS4 maxchar, maxchar2;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011400 Py_ssize_t left_len, right_len, new_len;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011401
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011402 if (ensure_unicode(left) < 0)
11403 return NULL;
11404
11405 if (!PyUnicode_Check(right)) {
11406 PyErr_Format(PyExc_TypeError,
11407 "can only concatenate str (not \"%.200s\") to str",
Victor Stinner58ac7002020-02-07 03:04:21 +010011408 Py_TYPE(right)->tp_name);
Serhiy Storchaka004e03f2017-03-19 19:38:42 +020011409 return NULL;
11410 }
11411 if (PyUnicode_READY(right) < 0)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011412 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011413
11414 /* Shortcuts */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011415 if (left == unicode_empty)
11416 return PyUnicode_FromObject(right);
11417 if (right == unicode_empty)
11418 return PyUnicode_FromObject(left);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011419
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011420 left_len = PyUnicode_GET_LENGTH(left);
11421 right_len = PyUnicode_GET_LENGTH(right);
11422 if (left_len > PY_SSIZE_T_MAX - right_len) {
Victor Stinner488fa492011-12-12 00:01:39 +010011423 PyErr_SetString(PyExc_OverflowError,
11424 "strings are too large to concat");
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011425 return NULL;
Victor Stinner488fa492011-12-12 00:01:39 +010011426 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011427 new_len = left_len + right_len;
Victor Stinner488fa492011-12-12 00:01:39 +010011428
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011429 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11430 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011431 maxchar = Py_MAX(maxchar, maxchar2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011432
Guido van Rossumd57fd912000-03-10 22:53:23 +000011433 /* Concat the two Unicode strings */
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011434 result = PyUnicode_New(new_len, maxchar);
11435 if (result == NULL)
11436 return NULL;
11437 _PyUnicode_FastCopyCharacters(result, 0, left, 0, left_len);
11438 _PyUnicode_FastCopyCharacters(result, left_len, right, 0, right_len);
11439 assert(_PyUnicode_CheckConsistency(result, 1));
11440 return result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011441}
11442
Walter Dörwald1ab83302007-05-18 17:15:44 +000011443void
Victor Stinner23e56682011-10-03 03:54:37 +020011444PyUnicode_Append(PyObject **p_left, PyObject *right)
Walter Dörwald1ab83302007-05-18 17:15:44 +000011445{
Victor Stinner23e56682011-10-03 03:54:37 +020011446 PyObject *left, *res;
Victor Stinner488fa492011-12-12 00:01:39 +010011447 Py_UCS4 maxchar, maxchar2;
11448 Py_ssize_t left_len, right_len, new_len;
Victor Stinner23e56682011-10-03 03:54:37 +020011449
11450 if (p_left == NULL) {
11451 if (!PyErr_Occurred())
11452 PyErr_BadInternalCall();
Benjamin Peterson14339b62009-01-31 16:36:08 +000011453 return;
11454 }
Victor Stinner23e56682011-10-03 03:54:37 +020011455 left = *p_left;
Victor Stinnerf0335102013-04-14 19:13:03 +020011456 if (right == NULL || left == NULL
11457 || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
Victor Stinner23e56682011-10-03 03:54:37 +020011458 if (!PyErr_Occurred())
11459 PyErr_BadInternalCall();
11460 goto error;
11461 }
11462
Benjamin Petersonbac79492012-01-14 13:34:47 -050011463 if (PyUnicode_READY(left) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011464 goto error;
Benjamin Petersonbac79492012-01-14 13:34:47 -050011465 if (PyUnicode_READY(right) == -1)
Victor Stinnere1335c72011-10-04 20:53:03 +020011466 goto error;
11467
Victor Stinner488fa492011-12-12 00:01:39 +010011468 /* Shortcuts */
11469 if (left == unicode_empty) {
11470 Py_DECREF(left);
11471 Py_INCREF(right);
11472 *p_left = right;
11473 return;
11474 }
11475 if (right == unicode_empty)
11476 return;
11477
11478 left_len = PyUnicode_GET_LENGTH(left);
11479 right_len = PyUnicode_GET_LENGTH(right);
11480 if (left_len > PY_SSIZE_T_MAX - right_len) {
11481 PyErr_SetString(PyExc_OverflowError,
11482 "strings are too large to concat");
11483 goto error;
11484 }
11485 new_len = left_len + right_len;
11486
11487 if (unicode_modifiable(left)
11488 && PyUnicode_CheckExact(right)
11489 && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
Victor Stinnerb0923652011-10-04 01:17:31 +020011490 /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11491 to change the structure size, but characters are stored just after
Georg Brandl7597add2011-10-05 16:36:47 +020011492 the structure, and so it requires to move all characters which is
Victor Stinnerb0923652011-10-04 01:17:31 +020011493 not so different than duplicating the string. */
Victor Stinner488fa492011-12-12 00:01:39 +010011494 && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11495 {
11496 /* append inplace */
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011497 if (unicode_resize(p_left, new_len) != 0)
Victor Stinner488fa492011-12-12 00:01:39 +010011498 goto error;
Victor Stinnerf0335102013-04-14 19:13:03 +020011499
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011500 /* copy 'right' into the newly allocated area of 'left' */
11501 _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
Victor Stinner23e56682011-10-03 03:54:37 +020011502 }
Victor Stinner488fa492011-12-12 00:01:39 +010011503 else {
11504 maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11505 maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
Benjamin Peterson7e303732013-06-10 09:19:46 -070011506 maxchar = Py_MAX(maxchar, maxchar2);
Victor Stinner23e56682011-10-03 03:54:37 +020011507
Victor Stinner488fa492011-12-12 00:01:39 +010011508 /* Concat the two Unicode strings */
11509 res = PyUnicode_New(new_len, maxchar);
11510 if (res == NULL)
11511 goto error;
Victor Stinnerd3f08822012-05-29 12:57:52 +020011512 _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11513 _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
Victor Stinner488fa492011-12-12 00:01:39 +010011514 Py_DECREF(left);
Victor Stinnerbb4503f2013-04-18 09:41:34 +020011515 *p_left = res;
Victor Stinner488fa492011-12-12 00:01:39 +010011516 }
11517 assert(_PyUnicode_CheckConsistency(*p_left, 1));
Victor Stinner23e56682011-10-03 03:54:37 +020011518 return;
11519
11520error:
Victor Stinner488fa492011-12-12 00:01:39 +010011521 Py_CLEAR(*p_left);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011522}
11523
11524void
11525PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11526{
Benjamin Peterson14339b62009-01-31 16:36:08 +000011527 PyUnicode_Append(pleft, right);
11528 Py_XDECREF(right);
Walter Dörwald1ab83302007-05-18 17:15:44 +000011529}
11530
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011531/*
11532Wraps stringlib_parse_args_finds() and additionally ensures that the
11533first argument is a unicode object.
11534*/
11535
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070011536static inline int
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011537parse_args_finds_unicode(const char * function_name, PyObject *args,
11538 PyObject **substring,
11539 Py_ssize_t *start, Py_ssize_t *end)
11540{
11541 if(stringlib_parse_args_finds(function_name, args, substring,
11542 start, end)) {
11543 if (ensure_unicode(*substring) < 0)
11544 return 0;
11545 return 1;
11546 }
11547 return 0;
11548}
11549
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011550PyDoc_STRVAR(count__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011551 "S.count(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011552\n\
Thomas Wouters477c8d52006-05-27 19:21:47 +000011553Return the number of non-overlapping occurrences of substring sub in\n\
Benjamin Peterson142957c2008-07-04 19:55:29 +000011554string S[start:end]. Optional arguments start and end are\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011555interpreted as in slice notation.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011556
11557static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011558unicode_count(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011559{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011560 PyObject *substring = NULL; /* initialize to fix a compiler warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011561 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000011562 Py_ssize_t end = PY_SSIZE_T_MAX;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011563 PyObject *result;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011564 int kind1, kind2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011565 void *buf1, *buf2;
11566 Py_ssize_t len1, len2, iresult;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011567
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011568 if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000011569 return NULL;
Tim Petersced69f82003-09-16 20:30:58 +000011570
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011571 kind1 = PyUnicode_KIND(self);
11572 kind2 = PyUnicode_KIND(substring);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011573 if (kind1 < kind2)
Benjamin Petersonb63f49f2012-05-03 18:31:07 -040011574 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011575
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011576 len1 = PyUnicode_GET_LENGTH(self);
11577 len2 = PyUnicode_GET_LENGTH(substring);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011578 ADJUST_INDICES(start, end, len1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011579 if (end - start < len2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011580 return PyLong_FromLong(0);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011581
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011582 buf1 = PyUnicode_DATA(self);
11583 buf2 = PyUnicode_DATA(substring);
11584 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030011585 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011586 if (!buf2)
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011587 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011588 }
11589 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011590 case PyUnicode_1BYTE_KIND:
11591 iresult = ucs1lib_count(
11592 ((Py_UCS1*)buf1) + start, end - start,
11593 buf2, len2, PY_SSIZE_T_MAX
11594 );
11595 break;
11596 case PyUnicode_2BYTE_KIND:
11597 iresult = ucs2lib_count(
11598 ((Py_UCS2*)buf1) + start, end - start,
11599 buf2, len2, PY_SSIZE_T_MAX
11600 );
11601 break;
11602 case PyUnicode_4BYTE_KIND:
11603 iresult = ucs4lib_count(
11604 ((Py_UCS4*)buf1) + start, end - start,
11605 buf2, len2, PY_SSIZE_T_MAX
11606 );
11607 break;
11608 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070011609 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011610 }
11611
11612 result = PyLong_FromSsize_t(iresult);
11613
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020011614 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011615 PyMem_Free(buf2);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011616
Guido van Rossumd57fd912000-03-10 22:53:23 +000011617 return result;
11618}
11619
INADA Naoki3ae20562017-01-16 20:41:20 +090011620/*[clinic input]
11621str.encode as unicode_encode
11622
11623 encoding: str(c_default="NULL") = 'utf-8'
11624 The encoding in which to encode the string.
11625 errors: str(c_default="NULL") = 'strict'
11626 The error handling scheme to use for encoding errors.
11627 The default is 'strict' meaning that encoding errors raise a
11628 UnicodeEncodeError. Other possible values are 'ignore', 'replace' and
11629 'xmlcharrefreplace' as well as any other name registered with
11630 codecs.register_error that can handle UnicodeEncodeErrors.
11631
11632Encode the string using the codec registered for encoding.
11633[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011634
11635static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090011636unicode_encode_impl(PyObject *self, const char *encoding, const char *errors)
INADA Naoki15f94592017-01-16 21:49:13 +090011637/*[clinic end generated code: output=bf78b6e2a9470e3c input=f0a9eb293d08fe02]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011638{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011639 return PyUnicode_AsEncodedString(self, encoding, errors);
Marc-André Lemburgd2d45982004-07-08 17:57:32 +000011640}
11641
INADA Naoki3ae20562017-01-16 20:41:20 +090011642/*[clinic input]
11643str.expandtabs as unicode_expandtabs
Guido van Rossumd57fd912000-03-10 22:53:23 +000011644
INADA Naoki3ae20562017-01-16 20:41:20 +090011645 tabsize: int = 8
11646
11647Return a copy where all tab characters are expanded using spaces.
11648
11649If tabsize is not given, a tab size of 8 characters is assumed.
11650[clinic start generated code]*/
11651
11652static PyObject *
11653unicode_expandtabs_impl(PyObject *self, int tabsize)
11654/*[clinic end generated code: output=3457c5dcee26928f input=8a01914034af4c85]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011655{
Antoine Pitroue71d5742011-10-04 15:55:09 +020011656 Py_ssize_t i, j, line_pos, src_len, incr;
11657 Py_UCS4 ch;
11658 PyObject *u;
11659 void *src_data, *dest_data;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011660 int kind;
Antoine Pitroue19aa382011-10-04 16:04:01 +020011661 int found;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011662
Antoine Pitrou22425222011-10-04 19:10:51 +020011663 if (PyUnicode_READY(self) == -1)
11664 return NULL;
11665
Thomas Wouters7e474022000-07-16 12:04:32 +000011666 /* First pass: determine size of output string */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011667 src_len = PyUnicode_GET_LENGTH(self);
11668 i = j = line_pos = 0;
11669 kind = PyUnicode_KIND(self);
11670 src_data = PyUnicode_DATA(self);
Antoine Pitroue19aa382011-10-04 16:04:01 +020011671 found = 0;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011672 for (; i < src_len; i++) {
11673 ch = PyUnicode_READ(kind, src_data, i);
11674 if (ch == '\t') {
Antoine Pitroue19aa382011-10-04 16:04:01 +020011675 found = 1;
Benjamin Peterson29060642009-01-31 22:14:21 +000011676 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011677 incr = tabsize - (line_pos % tabsize); /* cannot overflow */
Benjamin Peterson29060642009-01-31 22:14:21 +000011678 if (j > PY_SSIZE_T_MAX - incr)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011679 goto overflow;
11680 line_pos += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011681 j += incr;
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011682 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011683 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011684 else {
Benjamin Peterson29060642009-01-31 22:14:21 +000011685 if (j > PY_SSIZE_T_MAX - 1)
Antoine Pitroue71d5742011-10-04 15:55:09 +020011686 goto overflow;
11687 line_pos++;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011688 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011689 if (ch == '\n' || ch == '\r')
11690 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011691 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011692 }
Victor Stinnerc4b49542011-12-11 22:44:26 +010011693 if (!found)
11694 return unicode_result_unchanged(self);
Guido van Rossumcd16bf62007-06-13 18:07:49 +000011695
Guido van Rossumd57fd912000-03-10 22:53:23 +000011696 /* Second pass: create output string and fill it */
Antoine Pitroue71d5742011-10-04 15:55:09 +020011697 u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011698 if (!u)
11699 return NULL;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011700 dest_data = PyUnicode_DATA(u);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011701
Antoine Pitroue71d5742011-10-04 15:55:09 +020011702 i = j = line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011703
Antoine Pitroue71d5742011-10-04 15:55:09 +020011704 for (; i < src_len; i++) {
11705 ch = PyUnicode_READ(kind, src_data, i);
11706 if (ch == '\t') {
Benjamin Peterson29060642009-01-31 22:14:21 +000011707 if (tabsize > 0) {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011708 incr = tabsize - (line_pos % tabsize);
11709 line_pos += incr;
Victor Stinner59423e32018-11-26 13:40:01 +010011710 unicode_fill(kind, dest_data, ' ', j, incr);
Victor Stinnerda79e632012-02-22 13:37:04 +010011711 j += incr;
Benjamin Peterson29060642009-01-31 22:14:21 +000011712 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000011713 }
Benjamin Peterson29060642009-01-31 22:14:21 +000011714 else {
Antoine Pitroue71d5742011-10-04 15:55:09 +020011715 line_pos++;
11716 PyUnicode_WRITE(kind, dest_data, j, ch);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011717 j++;
Antoine Pitroue71d5742011-10-04 15:55:09 +020011718 if (ch == '\n' || ch == '\r')
11719 line_pos = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011720 }
Antoine Pitroue71d5742011-10-04 15:55:09 +020011721 }
11722 assert (j == PyUnicode_GET_LENGTH(u));
Victor Stinnerd3df8ab2011-11-22 01:22:34 +010011723 return unicode_result(u);
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011724
Antoine Pitroue71d5742011-10-04 15:55:09 +020011725 overflow:
Christian Heimesdd15f6c2008-03-16 00:07:10 +000011726 PyErr_SetString(PyExc_OverflowError, "new string is too long");
11727 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011728}
11729
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011730PyDoc_STRVAR(find__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011731 "S.find(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011732\n\
11733Return the lowest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080011734such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011735arguments start and end are interpreted as in slice notation.\n\
11736\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011737Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011738
11739static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011740unicode_find(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011741{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011742 /* initialize variables to prevent gcc warning */
11743 PyObject *substring = NULL;
11744 Py_ssize_t start = 0;
11745 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000011746 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011747
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011748 if (!parse_args_finds_unicode("find", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011749 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011750
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011751 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011752 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011753
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011754 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011755
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011756 if (result == -2)
11757 return NULL;
11758
Christian Heimes217cfd12007-12-02 14:31:20 +000011759 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011760}
11761
11762static PyObject *
Victor Stinner2fe5ced2011-10-02 00:25:40 +020011763unicode_getitem(PyObject *self, Py_ssize_t index)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011764{
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011765 void *data;
11766 enum PyUnicode_Kind kind;
11767 Py_UCS4 ch;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011768
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011769 if (!PyUnicode_Check(self)) {
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011770 PyErr_BadArgument();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011771 return NULL;
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011772 }
Serhiy Storchakae3b2b4b2017-09-08 09:58:51 +030011773 if (PyUnicode_READY(self) == -1) {
11774 return NULL;
11775 }
Victor Stinnerb6cd0142012-05-03 02:17:04 +020011776 if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11777 PyErr_SetString(PyExc_IndexError, "string index out of range");
11778 return NULL;
11779 }
11780 kind = PyUnicode_KIND(self);
11781 data = PyUnicode_DATA(self);
11782 ch = PyUnicode_READ(kind, data, index);
Victor Stinner985a82a2014-01-03 12:53:47 +010011783 return unicode_char(ch);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011784}
11785
Guido van Rossumc2504932007-09-18 19:42:40 +000011786/* Believe it or not, this produces the same value for ASCII strings
Mark Dickinson57e683e2011-09-24 18:18:40 +010011787 as bytes_hash(). */
Benjamin Peterson8f67d082010-10-17 20:54:53 +000011788static Py_hash_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020011789unicode_hash(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011790{
Gregory P. Smith27cbcd62012-12-10 18:15:46 -080011791 Py_uhash_t x; /* Unsigned for defined overflow behavior. */
Guido van Rossumc2504932007-09-18 19:42:40 +000011792
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011793#ifdef Py_DEBUG
Benjamin Peterson69e97272012-02-21 11:08:50 -050011794 assert(_Py_HashSecret_Initialized);
Benjamin Petersonf6622c82012-04-09 14:53:07 -040011795#endif
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011796 if (_PyUnicode_HASH(self) != -1)
11797 return _PyUnicode_HASH(self);
11798 if (PyUnicode_READY(self) == -1)
11799 return -1;
animalizea1d14252019-01-02 20:16:06 +080011800
Christian Heimes985ecdc2013-11-20 11:46:18 +010011801 x = _Py_HashBytes(PyUnicode_DATA(self),
11802 PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011803 _PyUnicode_HASH(self) = x;
Guido van Rossumc2504932007-09-18 19:42:40 +000011804 return x;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011805}
11806
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000011807PyDoc_STRVAR(index__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000011808 "S.index(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000011809\n\
oldkaa0735f2018-02-02 16:52:55 +080011810Return the lowest index in S where substring sub is found,\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070011811such that sub is contained within S[start:end]. Optional\n\
11812arguments start and end are interpreted as in slice notation.\n\
11813\n\
11814Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000011815
11816static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011817unicode_index(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000011818{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011819 /* initialize variables to prevent gcc warning */
Martin v. Löwis18e16552006-02-15 17:27:45 +000011820 Py_ssize_t result;
Victor Stinner0c39b1b2015-03-18 15:02:06 +010011821 PyObject *substring = NULL;
11822 Py_ssize_t start = 0;
11823 Py_ssize_t end = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011824
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030011825 if (!parse_args_finds_unicode("index", args, &substring, &start, &end))
Guido van Rossumd57fd912000-03-10 22:53:23 +000011826 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011827
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011828 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011829 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011830
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030011831 result = any_find_slice(self, substring, start, end, 1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000011832
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011833 if (result == -2)
11834 return NULL;
11835
Guido van Rossumd57fd912000-03-10 22:53:23 +000011836 if (result < 0) {
11837 PyErr_SetString(PyExc_ValueError, "substring not found");
11838 return NULL;
11839 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000011840
Christian Heimes217cfd12007-12-02 14:31:20 +000011841 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011842}
11843
INADA Naoki3ae20562017-01-16 20:41:20 +090011844/*[clinic input]
INADA Naokia49ac992018-01-27 14:06:21 +090011845str.isascii as unicode_isascii
11846
11847Return True if all characters in the string are ASCII, False otherwise.
11848
11849ASCII characters have code points in the range U+0000-U+007F.
11850Empty string is ASCII too.
11851[clinic start generated code]*/
11852
11853static PyObject *
11854unicode_isascii_impl(PyObject *self)
11855/*[clinic end generated code: output=c5910d64b5a8003f input=5a43cbc6399621d5]*/
11856{
11857 if (PyUnicode_READY(self) == -1) {
11858 return NULL;
11859 }
11860 return PyBool_FromLong(PyUnicode_IS_ASCII(self));
11861}
11862
11863/*[clinic input]
INADA Naoki3ae20562017-01-16 20:41:20 +090011864str.islower as unicode_islower
Guido van Rossumd57fd912000-03-10 22:53:23 +000011865
INADA Naoki3ae20562017-01-16 20:41:20 +090011866Return True if the string is a lowercase string, False otherwise.
11867
11868A string is lowercase if all cased characters in the string are lowercase and
11869there is at least one cased character in the string.
11870[clinic start generated code]*/
11871
11872static PyObject *
11873unicode_islower_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011874/*[clinic end generated code: output=dbd41995bd005b81 input=acec65ac6821ae47]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011875{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011876 Py_ssize_t i, length;
11877 int kind;
11878 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011879 int cased;
11880
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011881 if (PyUnicode_READY(self) == -1)
11882 return NULL;
11883 length = PyUnicode_GET_LENGTH(self);
11884 kind = PyUnicode_KIND(self);
11885 data = PyUnicode_DATA(self);
11886
Guido van Rossumd57fd912000-03-10 22:53:23 +000011887 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011888 if (length == 1)
11889 return PyBool_FromLong(
11890 Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000011891
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011892 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011893 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011894 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011895
Guido van Rossumd57fd912000-03-10 22:53:23 +000011896 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011897 for (i = 0; i < length; i++) {
11898 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011899
Benjamin Peterson29060642009-01-31 22:14:21 +000011900 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011901 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011902 else if (!cased && Py_UNICODE_ISLOWER(ch))
11903 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011904 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011905 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011906}
11907
INADA Naoki3ae20562017-01-16 20:41:20 +090011908/*[clinic input]
11909str.isupper as unicode_isupper
Guido van Rossumd57fd912000-03-10 22:53:23 +000011910
INADA Naoki3ae20562017-01-16 20:41:20 +090011911Return True if the string is an uppercase string, False otherwise.
11912
11913A string is uppercase if all cased characters in the string are uppercase and
11914there is at least one cased character in the string.
11915[clinic start generated code]*/
11916
11917static PyObject *
11918unicode_isupper_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011919/*[clinic end generated code: output=049209c8e7f15f59 input=e9b1feda5d17f2d3]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011920{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011921 Py_ssize_t i, length;
11922 int kind;
11923 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011924 int cased;
11925
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011926 if (PyUnicode_READY(self) == -1)
11927 return NULL;
11928 length = PyUnicode_GET_LENGTH(self);
11929 kind = PyUnicode_KIND(self);
11930 data = PyUnicode_DATA(self);
11931
Guido van Rossumd57fd912000-03-10 22:53:23 +000011932 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011933 if (length == 1)
11934 return PyBool_FromLong(
11935 Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011936
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011937 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011938 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011939 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011940
Guido van Rossumd57fd912000-03-10 22:53:23 +000011941 cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011942 for (i = 0; i < length; i++) {
11943 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011944
Benjamin Peterson29060642009-01-31 22:14:21 +000011945 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011946 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011947 else if (!cased && Py_UNICODE_ISUPPER(ch))
11948 cased = 1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011949 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000011950 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000011951}
11952
INADA Naoki3ae20562017-01-16 20:41:20 +090011953/*[clinic input]
11954str.istitle as unicode_istitle
Guido van Rossumd57fd912000-03-10 22:53:23 +000011955
INADA Naoki3ae20562017-01-16 20:41:20 +090011956Return True if the string is a title-cased string, False otherwise.
11957
11958In a title-cased string, upper- and title-case characters may only
11959follow uncased characters and lowercase characters only cased ones.
11960[clinic start generated code]*/
11961
11962static PyObject *
11963unicode_istitle_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090011964/*[clinic end generated code: output=e9bf6eb91f5d3f0e input=98d32bd2e1f06f8c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000011965{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011966 Py_ssize_t i, length;
11967 int kind;
11968 void *data;
Guido van Rossumd57fd912000-03-10 22:53:23 +000011969 int cased, previous_is_cased;
11970
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011971 if (PyUnicode_READY(self) == -1)
11972 return NULL;
11973 length = PyUnicode_GET_LENGTH(self);
11974 kind = PyUnicode_KIND(self);
11975 data = PyUnicode_DATA(self);
11976
Guido van Rossumd57fd912000-03-10 22:53:23 +000011977 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011978 if (length == 1) {
11979 Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11980 return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11981 (Py_UNICODE_ISUPPER(ch) != 0));
11982 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000011983
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011984 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011985 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011986 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000011987
Guido van Rossumd57fd912000-03-10 22:53:23 +000011988 cased = 0;
11989 previous_is_cased = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020011990 for (i = 0; i < length; i++) {
11991 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Tim Petersced69f82003-09-16 20:30:58 +000011992
Benjamin Peterson29060642009-01-31 22:14:21 +000011993 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11994 if (previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020011995 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000011996 previous_is_cased = 1;
11997 cased = 1;
11998 }
11999 else if (Py_UNICODE_ISLOWER(ch)) {
12000 if (!previous_is_cased)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012001 Py_RETURN_FALSE;
Benjamin Peterson29060642009-01-31 22:14:21 +000012002 previous_is_cased = 1;
12003 cased = 1;
12004 }
12005 else
12006 previous_is_cased = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012007 }
Guido van Rossum77f6a652002-04-03 22:41:51 +000012008 return PyBool_FromLong(cased);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012009}
12010
INADA Naoki3ae20562017-01-16 20:41:20 +090012011/*[clinic input]
12012str.isspace as unicode_isspace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012013
INADA Naoki3ae20562017-01-16 20:41:20 +090012014Return True if the string is a whitespace string, False otherwise.
12015
12016A string is whitespace if all characters in the string are whitespace and there
12017is at least one character in the string.
12018[clinic start generated code]*/
12019
12020static PyObject *
12021unicode_isspace_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012022/*[clinic end generated code: output=163a63bfa08ac2b9 input=fe462cb74f8437d8]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012023{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012024 Py_ssize_t i, length;
12025 int kind;
12026 void *data;
12027
12028 if (PyUnicode_READY(self) == -1)
12029 return NULL;
12030 length = PyUnicode_GET_LENGTH(self);
12031 kind = PyUnicode_KIND(self);
12032 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012033
Guido van Rossumd57fd912000-03-10 22:53:23 +000012034 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012035 if (length == 1)
12036 return PyBool_FromLong(
12037 Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012038
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012039 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012040 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012041 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012042
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012043 for (i = 0; i < length; i++) {
12044 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012045 if (!Py_UNICODE_ISSPACE(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012046 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012047 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012048 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012049}
12050
INADA Naoki3ae20562017-01-16 20:41:20 +090012051/*[clinic input]
12052str.isalpha as unicode_isalpha
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012053
INADA Naoki3ae20562017-01-16 20:41:20 +090012054Return True if the string is an alphabetic string, False otherwise.
12055
12056A string is alphabetic if all characters in the string are alphabetic and there
12057is at least one character in the string.
12058[clinic start generated code]*/
12059
12060static PyObject *
12061unicode_isalpha_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012062/*[clinic end generated code: output=cc81b9ac3883ec4f input=d0fd18a96cbca5eb]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012063{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012064 Py_ssize_t i, length;
12065 int kind;
12066 void *data;
12067
12068 if (PyUnicode_READY(self) == -1)
12069 return NULL;
12070 length = PyUnicode_GET_LENGTH(self);
12071 kind = PyUnicode_KIND(self);
12072 data = PyUnicode_DATA(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012073
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012074 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012075 if (length == 1)
12076 return PyBool_FromLong(
12077 Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012078
12079 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012080 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012081 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012082
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012083 for (i = 0; i < length; i++) {
12084 if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012085 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012086 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012087 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012088}
12089
INADA Naoki3ae20562017-01-16 20:41:20 +090012090/*[clinic input]
12091str.isalnum as unicode_isalnum
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012092
INADA Naoki3ae20562017-01-16 20:41:20 +090012093Return True if the string is an alpha-numeric string, False otherwise.
12094
12095A string is alpha-numeric if all characters in the string are alpha-numeric and
12096there is at least one character in the string.
12097[clinic start generated code]*/
12098
12099static PyObject *
12100unicode_isalnum_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012101/*[clinic end generated code: output=a5a23490ffc3660c input=5c6579bf2e04758c]*/
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012102{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012103 int kind;
12104 void *data;
12105 Py_ssize_t len, i;
12106
12107 if (PyUnicode_READY(self) == -1)
12108 return NULL;
12109
12110 kind = PyUnicode_KIND(self);
12111 data = PyUnicode_DATA(self);
12112 len = PyUnicode_GET_LENGTH(self);
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012113
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012114 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012115 if (len == 1) {
12116 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12117 return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
12118 }
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012119
12120 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012121 if (len == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012122 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012123
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012124 for (i = 0; i < len; i++) {
12125 const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
Ezio Melotti93e7afc2011-08-22 14:08:38 +030012126 if (!Py_UNICODE_ISALNUM(ch))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012127 Py_RETURN_FALSE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012128 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012129 Py_RETURN_TRUE;
Marc-André Lemburga7acf422000-07-05 09:49:44 +000012130}
12131
INADA Naoki3ae20562017-01-16 20:41:20 +090012132/*[clinic input]
12133str.isdecimal as unicode_isdecimal
Guido van Rossumd57fd912000-03-10 22:53:23 +000012134
INADA Naoki3ae20562017-01-16 20:41:20 +090012135Return True if the string is a decimal string, False otherwise.
12136
12137A string is a decimal string if all characters in the string are decimal and
12138there is at least one character in the string.
12139[clinic start generated code]*/
12140
12141static PyObject *
12142unicode_isdecimal_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012143/*[clinic end generated code: output=fb2dcdb62d3fc548 input=336bc97ab4c8268f]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012144{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012145 Py_ssize_t i, length;
12146 int kind;
12147 void *data;
12148
12149 if (PyUnicode_READY(self) == -1)
12150 return NULL;
12151 length = PyUnicode_GET_LENGTH(self);
12152 kind = PyUnicode_KIND(self);
12153 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012154
Guido van Rossumd57fd912000-03-10 22:53:23 +000012155 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012156 if (length == 1)
12157 return PyBool_FromLong(
12158 Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012159
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012160 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012161 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012162 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012163
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012164 for (i = 0; i < length; i++) {
12165 if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012166 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012167 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012168 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012169}
12170
INADA Naoki3ae20562017-01-16 20:41:20 +090012171/*[clinic input]
12172str.isdigit as unicode_isdigit
Guido van Rossumd57fd912000-03-10 22:53:23 +000012173
INADA Naoki3ae20562017-01-16 20:41:20 +090012174Return True if the string is a digit string, False otherwise.
12175
12176A string is a digit string if all characters in the string are digits and there
12177is at least one character in the string.
12178[clinic start generated code]*/
12179
12180static PyObject *
12181unicode_isdigit_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012182/*[clinic end generated code: output=10a6985311da6858 input=901116c31deeea4c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012183{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012184 Py_ssize_t i, length;
12185 int kind;
12186 void *data;
12187
12188 if (PyUnicode_READY(self) == -1)
12189 return NULL;
12190 length = PyUnicode_GET_LENGTH(self);
12191 kind = PyUnicode_KIND(self);
12192 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012193
Guido van Rossumd57fd912000-03-10 22:53:23 +000012194 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012195 if (length == 1) {
12196 const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
12197 return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
12198 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012199
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012200 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012201 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012202 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012203
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012204 for (i = 0; i < length; i++) {
12205 if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012206 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012207 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012208 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012209}
12210
INADA Naoki3ae20562017-01-16 20:41:20 +090012211/*[clinic input]
12212str.isnumeric as unicode_isnumeric
Guido van Rossumd57fd912000-03-10 22:53:23 +000012213
INADA Naoki3ae20562017-01-16 20:41:20 +090012214Return True if the string is a numeric string, False otherwise.
12215
12216A string is numeric if all characters in the string are numeric and there is at
12217least one character in the string.
12218[clinic start generated code]*/
12219
12220static PyObject *
12221unicode_isnumeric_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012222/*[clinic end generated code: output=9172a32d9013051a input=722507db976f826c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012223{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012224 Py_ssize_t i, length;
12225 int kind;
12226 void *data;
12227
12228 if (PyUnicode_READY(self) == -1)
12229 return NULL;
12230 length = PyUnicode_GET_LENGTH(self);
12231 kind = PyUnicode_KIND(self);
12232 data = PyUnicode_DATA(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012233
Guido van Rossumd57fd912000-03-10 22:53:23 +000012234 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012235 if (length == 1)
12236 return PyBool_FromLong(
12237 Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012238
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012239 /* Special case for empty strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012240 if (length == 0)
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012241 Py_RETURN_FALSE;
Marc-André Lemburg60bc8092000-06-14 09:18:32 +000012242
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012243 for (i = 0; i < length; i++) {
12244 if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012245 Py_RETURN_FALSE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012246 }
Serhiy Storchaka370fd202017-03-08 20:47:48 +020012247 Py_RETURN_TRUE;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012248}
12249
Martin v. Löwis47383402007-08-15 07:32:56 +000012250int
12251PyUnicode_IsIdentifier(PyObject *self)
12252{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012253 Py_ssize_t i;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012254 int ready = PyUnicode_IS_READY(self);
Martin v. Löwis47383402007-08-15 07:32:56 +000012255
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012256 Py_ssize_t len = ready ? PyUnicode_GET_LENGTH(self) : PyUnicode_GET_SIZE(self);
12257 if (len == 0) {
12258 /* an empty string is not a valid identifier */
Benjamin Peterson29060642009-01-31 22:14:21 +000012259 return 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012260 }
12261
Hai Shi3d235f52020-02-17 21:41:15 +080012262 int kind = 0;
12263 void *data = NULL;
Andy Lester933fc53f2020-02-20 22:51:47 -060012264 const wchar_t *wstr = NULL;
12265 Py_UCS4 ch;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012266 if (ready) {
12267 kind = PyUnicode_KIND(self);
12268 data = PyUnicode_DATA(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012269 ch = PyUnicode_READ(kind, data, 0);
12270 }
12271 else {
Andy Lester933fc53f2020-02-20 22:51:47 -060012272 wstr = _PyUnicode_WSTR(self);
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012273 ch = wstr[0];
12274 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012275 /* PEP 3131 says that the first character must be in
12276 XID_Start and subsequent characters in XID_Continue,
12277 and for the ASCII range, the 2.x rules apply (i.e
Benjamin Peterson14339b62009-01-31 16:36:08 +000012278 start with letters and underscore, continue with
Martin v. Löwis47383402007-08-15 07:32:56 +000012279 letters, digits, underscore). However, given the current
12280 definition of XID_Start and XID_Continue, it is sufficient
12281 to check just for these, except that _ must be allowed
12282 as starting an identifier. */
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012283 if (!_PyUnicode_IsXidStart(ch) && ch != 0x5F /* LOW LINE */) {
Martin v. Löwis47383402007-08-15 07:32:56 +000012284 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012285 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012286
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012287 for (i = 1; i < len; i++) {
12288 if (ready) {
12289 ch = PyUnicode_READ(kind, data, i);
12290 }
12291 else {
12292 ch = wstr[i];
12293 }
12294 if (!_PyUnicode_IsXidContinue(ch)) {
Benjamin Peterson29060642009-01-31 22:14:21 +000012295 return 0;
Victor Stinnerf3e7ea52020-02-11 14:29:33 +010012296 }
12297 }
Martin v. Löwis47383402007-08-15 07:32:56 +000012298 return 1;
12299}
12300
INADA Naoki3ae20562017-01-16 20:41:20 +090012301/*[clinic input]
12302str.isidentifier as unicode_isidentifier
Martin v. Löwis47383402007-08-15 07:32:56 +000012303
INADA Naoki3ae20562017-01-16 20:41:20 +090012304Return True if the string is a valid Python identifier, False otherwise.
12305
Sanyam Khuranaffc5a142018-10-08 12:23:32 +053012306Call keyword.iskeyword(s) to test whether string s is a reserved identifier,
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012307such as "def" or "class".
INADA Naoki3ae20562017-01-16 20:41:20 +090012308[clinic start generated code]*/
12309
12310static PyObject *
12311unicode_isidentifier_impl(PyObject *self)
Emanuele Gaifasfc8205c2018-10-08 12:44:47 +020012312/*[clinic end generated code: output=fe585a9666572905 input=2d807a104f21c0c5]*/
Martin v. Löwis47383402007-08-15 07:32:56 +000012313{
12314 return PyBool_FromLong(PyUnicode_IsIdentifier(self));
12315}
12316
INADA Naoki3ae20562017-01-16 20:41:20 +090012317/*[clinic input]
12318str.isprintable as unicode_isprintable
Georg Brandl559e5d72008-06-11 18:37:52 +000012319
INADA Naoki3ae20562017-01-16 20:41:20 +090012320Return True if the string is printable, False otherwise.
12321
12322A string is printable if all of its characters are considered printable in
12323repr() or if it is empty.
12324[clinic start generated code]*/
12325
12326static PyObject *
12327unicode_isprintable_impl(PyObject *self)
INADA Naoki15f94592017-01-16 21:49:13 +090012328/*[clinic end generated code: output=3ab9626cd32dd1a0 input=98a0e1c2c1813209]*/
Georg Brandl559e5d72008-06-11 18:37:52 +000012329{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012330 Py_ssize_t i, length;
12331 int kind;
12332 void *data;
12333
12334 if (PyUnicode_READY(self) == -1)
12335 return NULL;
12336 length = PyUnicode_GET_LENGTH(self);
12337 kind = PyUnicode_KIND(self);
12338 data = PyUnicode_DATA(self);
Georg Brandl559e5d72008-06-11 18:37:52 +000012339
12340 /* Shortcut for single character strings */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012341 if (length == 1)
12342 return PyBool_FromLong(
12343 Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
Georg Brandl559e5d72008-06-11 18:37:52 +000012344
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012345 for (i = 0; i < length; i++) {
12346 if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
Georg Brandl559e5d72008-06-11 18:37:52 +000012347 Py_RETURN_FALSE;
12348 }
12349 }
12350 Py_RETURN_TRUE;
12351}
12352
INADA Naoki3ae20562017-01-16 20:41:20 +090012353/*[clinic input]
12354str.join as unicode_join
Guido van Rossumd57fd912000-03-10 22:53:23 +000012355
INADA Naoki3ae20562017-01-16 20:41:20 +090012356 iterable: object
12357 /
12358
12359Concatenate any number of strings.
12360
Martin Panter91a88662017-01-24 00:30:06 +000012361The string whose method is called is inserted in between each given string.
INADA Naoki3ae20562017-01-16 20:41:20 +090012362The result is returned as a new string.
12363
12364Example: '.'.join(['ab', 'pq', 'rs']) -> 'ab.pq.rs'
12365[clinic start generated code]*/
12366
12367static PyObject *
12368unicode_join(PyObject *self, PyObject *iterable)
Martin Panter91a88662017-01-24 00:30:06 +000012369/*[clinic end generated code: output=6857e7cecfe7bf98 input=2f70422bfb8fa189]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012370{
INADA Naoki3ae20562017-01-16 20:41:20 +090012371 return PyUnicode_Join(self, iterable);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012372}
12373
Martin v. Löwis18e16552006-02-15 17:27:45 +000012374static Py_ssize_t
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012375unicode_length(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012376{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012377 if (PyUnicode_READY(self) == -1)
12378 return -1;
12379 return PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012380}
12381
INADA Naoki3ae20562017-01-16 20:41:20 +090012382/*[clinic input]
12383str.ljust as unicode_ljust
12384
12385 width: Py_ssize_t
12386 fillchar: Py_UCS4 = ' '
12387 /
12388
12389Return a left-justified string of length width.
12390
12391Padding is done using the specified fill character (default is a space).
12392[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012393
12394static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012395unicode_ljust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
12396/*[clinic end generated code: output=1cce0e0e0a0b84b3 input=3ab599e335e60a32]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012397{
Benjamin Petersonbac79492012-01-14 13:34:47 -050012398 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012399 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012400
Victor Stinnerc4b49542011-12-11 22:44:26 +010012401 if (PyUnicode_GET_LENGTH(self) >= width)
12402 return unicode_result_unchanged(self);
12403
12404 return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012405}
12406
INADA Naoki3ae20562017-01-16 20:41:20 +090012407/*[clinic input]
12408str.lower as unicode_lower
Guido van Rossumd57fd912000-03-10 22:53:23 +000012409
INADA Naoki3ae20562017-01-16 20:41:20 +090012410Return a copy of the string converted to lowercase.
12411[clinic start generated code]*/
12412
12413static PyObject *
12414unicode_lower_impl(PyObject *self)
12415/*[clinic end generated code: output=84ef9ed42efad663 input=60a2984b8beff23a]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012416{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050012417 if (PyUnicode_READY(self) == -1)
12418 return NULL;
12419 if (PyUnicode_IS_ASCII(self))
12420 return ascii_upper_or_lower(self, 1);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010012421 return case_operation(self, do_lower);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012422}
12423
/* Strip modes.  The numeric values double as indices into
   stripfuncnames[] below, so the two must be kept in sync. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Method names indexed by the strip modes above.  Both the characters
   and the pointers are const: the table is never modified. */
static const char * const stripfuncnames[] = {"lstrip", "rstrip", "strip"};

/* Name of the str method implementing strip mode i (used in error
   messages). */
#define STRIPNAME(i) (stripfuncnames[(i)])
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012432
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012433/* externally visible for str.strip(unicode) */
12434PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012435_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012436{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012437 void *data;
12438 int kind;
12439 Py_ssize_t i, j, len;
12440 BLOOM_MASK sepmask;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012441 Py_ssize_t seplen;
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012442
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012443 if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12444 return NULL;
12445
12446 kind = PyUnicode_KIND(self);
12447 data = PyUnicode_DATA(self);
12448 len = PyUnicode_GET_LENGTH(self);
Victor Stinnerb3a60142013-04-09 22:19:21 +020012449 seplen = PyUnicode_GET_LENGTH(sepobj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012450 sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12451 PyUnicode_DATA(sepobj),
Victor Stinnerb3a60142013-04-09 22:19:21 +020012452 seplen);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012453
Benjamin Peterson14339b62009-01-31 16:36:08 +000012454 i = 0;
12455 if (striptype != RIGHTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012456 while (i < len) {
12457 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12458 if (!BLOOM(sepmask, ch))
12459 break;
12460 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12461 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012462 i++;
12463 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012464 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012465
Benjamin Peterson14339b62009-01-31 16:36:08 +000012466 j = len;
12467 if (striptype != LEFTSTRIP) {
Victor Stinnerb3a60142013-04-09 22:19:21 +020012468 j--;
12469 while (j >= i) {
12470 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12471 if (!BLOOM(sepmask, ch))
12472 break;
12473 if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12474 break;
Benjamin Peterson29060642009-01-31 22:14:21 +000012475 j--;
Victor Stinnerb3a60142013-04-09 22:19:21 +020012476 }
12477
Benjamin Peterson29060642009-01-31 22:14:21 +000012478 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012479 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012480
Victor Stinner7931d9a2011-11-04 00:22:48 +010012481 return PyUnicode_Substring(self, i, j);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012482}
12483
12484PyObject*
12485PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12486{
12487 unsigned char *data;
12488 int kind;
Victor Stinner12bab6d2011-10-01 01:53:49 +020012489 Py_ssize_t length;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012490
Victor Stinnerde636f32011-10-01 03:55:54 +020012491 if (PyUnicode_READY(self) == -1)
12492 return NULL;
12493
Victor Stinner684d5fd2012-05-03 02:32:34 +020012494 length = PyUnicode_GET_LENGTH(self);
12495 end = Py_MIN(end, length);
Victor Stinnerde636f32011-10-01 03:55:54 +020012496
Victor Stinner684d5fd2012-05-03 02:32:34 +020012497 if (start == 0 && end == length)
Victor Stinnerc4b49542011-12-11 22:44:26 +010012498 return unicode_result_unchanged(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012499
Victor Stinnerde636f32011-10-01 03:55:54 +020012500 if (start < 0 || end < 0) {
Victor Stinner12bab6d2011-10-01 01:53:49 +020012501 PyErr_SetString(PyExc_IndexError, "string index out of range");
12502 return NULL;
12503 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020012504 if (start >= length || end < start)
12505 _Py_RETURN_UNICODE_EMPTY();
Victor Stinner12bab6d2011-10-01 01:53:49 +020012506
Victor Stinner684d5fd2012-05-03 02:32:34 +020012507 length = end - start;
Victor Stinnerb9275c12011-10-05 14:01:42 +020012508 if (PyUnicode_IS_ASCII(self)) {
Victor Stinnerb9275c12011-10-05 14:01:42 +020012509 data = PyUnicode_1BYTE_DATA(self);
Victor Stinnerd3f08822012-05-29 12:57:52 +020012510 return _PyUnicode_FromASCII((char*)(data + start), length);
Victor Stinnerb9275c12011-10-05 14:01:42 +020012511 }
12512 else {
12513 kind = PyUnicode_KIND(self);
12514 data = PyUnicode_1BYTE_DATA(self);
12515 return PyUnicode_FromKindAndData(kind,
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012516 data + kind * start,
Victor Stinnerb9275c12011-10-05 14:01:42 +020012517 length);
12518 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012519}
Guido van Rossumd57fd912000-03-10 22:53:23 +000012520
12521static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012522do_strip(PyObject *self, int striptype)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012523{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012524 Py_ssize_t len, i, j;
12525
12526 if (PyUnicode_READY(self) == -1)
12527 return NULL;
12528
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012529 len = PyUnicode_GET_LENGTH(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012530
Victor Stinnercc7af722013-04-09 22:39:24 +020012531 if (PyUnicode_IS_ASCII(self)) {
12532 Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12533
12534 i = 0;
12535 if (striptype != RIGHTSTRIP) {
12536 while (i < len) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012537 Py_UCS1 ch = data[i];
Victor Stinnercc7af722013-04-09 22:39:24 +020012538 if (!_Py_ascii_whitespace[ch])
12539 break;
12540 i++;
12541 }
12542 }
12543
12544 j = len;
12545 if (striptype != LEFTSTRIP) {
12546 j--;
12547 while (j >= i) {
Victor Stinnerd92e0782013-04-14 19:17:42 +020012548 Py_UCS1 ch = data[j];
Victor Stinnercc7af722013-04-09 22:39:24 +020012549 if (!_Py_ascii_whitespace[ch])
12550 break;
12551 j--;
12552 }
12553 j++;
Benjamin Peterson14339b62009-01-31 16:36:08 +000012554 }
12555 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012556 else {
12557 int kind = PyUnicode_KIND(self);
12558 void *data = PyUnicode_DATA(self);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012559
Victor Stinnercc7af722013-04-09 22:39:24 +020012560 i = 0;
12561 if (striptype != RIGHTSTRIP) {
12562 while (i < len) {
12563 Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12564 if (!Py_UNICODE_ISSPACE(ch))
12565 break;
12566 i++;
12567 }
Victor Stinner9c79e412013-04-09 22:21:08 +020012568 }
Victor Stinnercc7af722013-04-09 22:39:24 +020012569
12570 j = len;
12571 if (striptype != LEFTSTRIP) {
12572 j--;
12573 while (j >= i) {
12574 Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12575 if (!Py_UNICODE_ISSPACE(ch))
12576 break;
12577 j--;
12578 }
12579 j++;
12580 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000012581 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012582
Victor Stinner7931d9a2011-11-04 00:22:48 +010012583 return PyUnicode_Substring(self, i, j);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012584}
12585
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012586
12587static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012588do_argstrip(PyObject *self, int striptype, PyObject *sep)
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012589{
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012590 if (sep != Py_None) {
Benjamin Peterson14339b62009-01-31 16:36:08 +000012591 if (PyUnicode_Check(sep))
12592 return _PyUnicode_XStrip(self, striptype, sep);
12593 else {
12594 PyErr_Format(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000012595 "%s arg must be None or str",
12596 STRIPNAME(striptype));
Benjamin Peterson14339b62009-01-31 16:36:08 +000012597 return NULL;
12598 }
12599 }
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012600
Benjamin Peterson14339b62009-01-31 16:36:08 +000012601 return do_strip(self, striptype);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012602}
12603
12604
INADA Naoki3ae20562017-01-16 20:41:20 +090012605/*[clinic input]
12606str.strip as unicode_strip
12607
12608 chars: object = None
12609 /
12610
Zachary Ware09895c22019-10-09 16:09:00 -050012611Return a copy of the string with leading and trailing whitespace removed.
INADA Naoki3ae20562017-01-16 20:41:20 +090012612
12613If chars is given and not None, remove characters in chars instead.
12614[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012615
12616static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012617unicode_strip_impl(PyObject *self, PyObject *chars)
Zachary Ware09895c22019-10-09 16:09:00 -050012618/*[clinic end generated code: output=ca19018454345d57 input=385289c6f423b954]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012619{
INADA Naoki3ae20562017-01-16 20:41:20 +090012620 return do_argstrip(self, BOTHSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012621}
12622
12623
INADA Naoki3ae20562017-01-16 20:41:20 +090012624/*[clinic input]
12625str.lstrip as unicode_lstrip
12626
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012627 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012628 /
12629
12630Return a copy of the string with leading whitespace removed.
12631
12632If chars is given and not None, remove characters in chars instead.
12633[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012634
12635static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012636unicode_lstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012637/*[clinic end generated code: output=3b43683251f79ca7 input=529f9f3834448671]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012638{
INADA Naoki3ae20562017-01-16 20:41:20 +090012639 return do_argstrip(self, LEFTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012640}
12641
12642
INADA Naoki3ae20562017-01-16 20:41:20 +090012643/*[clinic input]
12644str.rstrip as unicode_rstrip
12645
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012646 chars: object = None
INADA Naoki3ae20562017-01-16 20:41:20 +090012647 /
12648
12649Return a copy of the string with trailing whitespace removed.
12650
12651If chars is given and not None, remove characters in chars instead.
12652[clinic start generated code]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012653
12654static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090012655unicode_rstrip_impl(PyObject *self, PyObject *chars)
Serhiy Storchaka279f4462019-09-14 12:24:05 +030012656/*[clinic end generated code: output=4a59230017cc3b7a input=62566c627916557f]*/
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012657{
INADA Naoki3ae20562017-01-16 20:41:20 +090012658 return do_argstrip(self, RIGHTSTRIP, chars);
Walter Dörwaldde02bcb2002-04-22 17:42:37 +000012659}
12660
12661
Guido van Rossumd57fd912000-03-10 22:53:23 +000012662static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012663unicode_repeat(PyObject *str, Py_ssize_t len)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012664{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012665 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012666 Py_ssize_t nchars, n;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012667
Serhiy Storchaka05997252013-01-26 12:14:02 +020012668 if (len < 1)
12669 _Py_RETURN_UNICODE_EMPTY();
Guido van Rossumd57fd912000-03-10 22:53:23 +000012670
Victor Stinnerc4b49542011-12-11 22:44:26 +010012671 /* no repeat, return original string */
12672 if (len == 1)
12673 return unicode_result_unchanged(str);
Tim Peters8f422462000-09-09 06:13:41 +000012674
Benjamin Petersonbac79492012-01-14 13:34:47 -050012675 if (PyUnicode_READY(str) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012676 return NULL;
12677
Victor Stinnerc759f3e2011-10-01 03:09:58 +020012678 if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
Victor Stinner67ca64c2011-10-01 02:47:29 +020012679 PyErr_SetString(PyExc_OverflowError,
12680 "repeated string is too long");
12681 return NULL;
12682 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012683 nchars = len * PyUnicode_GET_LENGTH(str);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012684
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012685 u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012686 if (!u)
12687 return NULL;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012688 assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
Guido van Rossumd57fd912000-03-10 22:53:23 +000012689
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012690 if (PyUnicode_GET_LENGTH(str) == 1) {
12691 const int kind = PyUnicode_KIND(str);
12692 const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
Victor Stinner73f53b52011-12-18 03:26:31 +010012693 if (kind == PyUnicode_1BYTE_KIND) {
12694 void *to = PyUnicode_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012695 memset(to, (unsigned char)fill_char, len);
Victor Stinner73f53b52011-12-18 03:26:31 +010012696 }
12697 else if (kind == PyUnicode_2BYTE_KIND) {
12698 Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
Victor Stinner67ca64c2011-10-01 02:47:29 +020012699 for (n = 0; n < len; ++n)
Victor Stinner73f53b52011-12-18 03:26:31 +010012700 ucs2[n] = fill_char;
12701 } else {
12702 Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12703 assert(kind == PyUnicode_4BYTE_KIND);
12704 for (n = 0; n < len; ++n)
12705 ucs4[n] = fill_char;
Victor Stinner67ca64c2011-10-01 02:47:29 +020012706 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012707 }
12708 else {
12709 /* number of characters copied this far */
12710 Py_ssize_t done = PyUnicode_GET_LENGTH(str);
Martin v. Löwisc47adb02011-10-07 20:55:35 +020012711 const Py_ssize_t char_size = PyUnicode_KIND(str);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012712 char *to = (char *) PyUnicode_DATA(u);
Christian Heimesf051e432016-09-13 20:22:02 +020012713 memcpy(to, PyUnicode_DATA(str),
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012714 PyUnicode_GET_LENGTH(str) * char_size);
Benjamin Peterson29060642009-01-31 22:14:21 +000012715 while (done < nchars) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012716 n = (done <= nchars-done) ? done : nchars-done;
Christian Heimesf051e432016-09-13 20:22:02 +020012717 memcpy(to + (done * char_size), to, n * char_size);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012718 done += n;
Benjamin Peterson29060642009-01-31 22:14:21 +000012719 }
Guido van Rossumd57fd912000-03-10 22:53:23 +000012720 }
12721
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020012722 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner9db1a8b2011-10-23 20:04:37 +020012723 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012724}
12725
Alexander Belopolsky40018472011-02-26 01:02:56 +000012726PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012727PyUnicode_Replace(PyObject *str,
12728 PyObject *substr,
12729 PyObject *replstr,
Alexander Belopolsky40018472011-02-26 01:02:56 +000012730 Py_ssize_t maxcount)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012731{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012732 if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0 ||
12733 ensure_unicode(replstr) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000012734 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012735 return replace(str, substr, replstr, maxcount);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012736}
12737
INADA Naoki3ae20562017-01-16 20:41:20 +090012738/*[clinic input]
12739str.replace as unicode_replace
Guido van Rossumd57fd912000-03-10 22:53:23 +000012740
INADA Naoki3ae20562017-01-16 20:41:20 +090012741 old: unicode
12742 new: unicode
12743 count: Py_ssize_t = -1
12744 Maximum number of occurrences to replace.
12745 -1 (the default value) means replace all occurrences.
12746 /
12747
12748Return a copy with all occurrences of substring old replaced by new.
12749
12750If the optional argument count is given, only the first count occurrences are
12751replaced.
12752[clinic start generated code]*/
12753
12754static PyObject *
12755unicode_replace_impl(PyObject *self, PyObject *old, PyObject *new,
12756 Py_ssize_t count)
12757/*[clinic end generated code: output=b63f1a8b5eebf448 input=147d12206276ebeb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000012758{
Benjamin Peterson22a29702012-01-02 09:00:30 -060012759 if (PyUnicode_READY(self) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000012760 return NULL;
INADA Naoki3ae20562017-01-16 20:41:20 +090012761 return replace(self, old, new, count);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012762}
12763
Alexander Belopolsky40018472011-02-26 01:02:56 +000012764static PyObject *
12765unicode_repr(PyObject *unicode)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012766{
Walter Dörwald79e913e2007-05-12 11:08:06 +000012767 PyObject *repr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012768 Py_ssize_t isize;
12769 Py_ssize_t osize, squote, dquote, i, o;
12770 Py_UCS4 max, quote;
Victor Stinner55c08782013-04-14 18:45:39 +020012771 int ikind, okind, unchanged;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012772 void *idata, *odata;
Walter Dörwald79e913e2007-05-12 11:08:06 +000012773
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012774 if (PyUnicode_READY(unicode) == -1)
Walter Dörwald79e913e2007-05-12 11:08:06 +000012775 return NULL;
12776
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012777 isize = PyUnicode_GET_LENGTH(unicode);
12778 idata = PyUnicode_DATA(unicode);
Walter Dörwald79e913e2007-05-12 11:08:06 +000012779
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012780 /* Compute length of output, quote characters, and
12781 maximum character */
Victor Stinner55c08782013-04-14 18:45:39 +020012782 osize = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012783 max = 127;
12784 squote = dquote = 0;
12785 ikind = PyUnicode_KIND(unicode);
12786 for (i = 0; i < isize; i++) {
12787 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Benjamin Peterson736b8012014-09-29 23:02:15 -040012788 Py_ssize_t incr = 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012789 switch (ch) {
Benjamin Peterson736b8012014-09-29 23:02:15 -040012790 case '\'': squote++; break;
12791 case '"': dquote++; break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012792 case '\\': case '\t': case '\r': case '\n':
Benjamin Peterson736b8012014-09-29 23:02:15 -040012793 incr = 2;
12794 break;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012795 default:
12796 /* Fast-path ASCII */
12797 if (ch < ' ' || ch == 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012798 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012799 else if (ch < 0x7f)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012800 ;
12801 else if (Py_UNICODE_ISPRINTABLE(ch))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012802 max = ch > max ? ch : max;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012803 else if (ch < 0x100)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012804 incr = 4; /* \xHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012805 else if (ch < 0x10000)
Benjamin Peterson736b8012014-09-29 23:02:15 -040012806 incr = 6; /* \uHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012807 else
Benjamin Peterson736b8012014-09-29 23:02:15 -040012808 incr = 10; /* \uHHHHHHHH */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012809 }
Benjamin Peterson736b8012014-09-29 23:02:15 -040012810 if (osize > PY_SSIZE_T_MAX - incr) {
12811 PyErr_SetString(PyExc_OverflowError,
12812 "string is too long to generate repr");
12813 return NULL;
12814 }
12815 osize += incr;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012816 }
12817
12818 quote = '\'';
Victor Stinner55c08782013-04-14 18:45:39 +020012819 unchanged = (osize == isize);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012820 if (squote) {
Victor Stinner55c08782013-04-14 18:45:39 +020012821 unchanged = 0;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012822 if (dquote)
12823 /* Both squote and dquote present. Use squote,
12824 and escape them */
12825 osize += squote;
12826 else
12827 quote = '"';
12828 }
Victor Stinner55c08782013-04-14 18:45:39 +020012829 osize += 2; /* quotes */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012830
12831 repr = PyUnicode_New(osize, max);
12832 if (repr == NULL)
12833 return NULL;
12834 okind = PyUnicode_KIND(repr);
12835 odata = PyUnicode_DATA(repr);
12836
12837 PyUnicode_WRITE(okind, odata, 0, quote);
12838 PyUnicode_WRITE(okind, odata, osize-1, quote);
Victor Stinner55c08782013-04-14 18:45:39 +020012839 if (unchanged) {
12840 _PyUnicode_FastCopyCharacters(repr, 1,
12841 unicode, 0,
12842 isize);
12843 }
12844 else {
12845 for (i = 0, o = 1; i < isize; i++) {
12846 Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012847
Victor Stinner55c08782013-04-14 18:45:39 +020012848 /* Escape quotes and backslashes */
12849 if ((ch == quote) || (ch == '\\')) {
Kristján Valur Jónsson55e5dc82012-06-06 21:58:08 +000012850 PyUnicode_WRITE(okind, odata, o++, '\\');
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012851 PyUnicode_WRITE(okind, odata, o++, ch);
Victor Stinner55c08782013-04-14 18:45:39 +020012852 continue;
12853 }
12854
12855 /* Map special whitespace to '\t', \n', '\r' */
12856 if (ch == '\t') {
12857 PyUnicode_WRITE(okind, odata, o++, '\\');
12858 PyUnicode_WRITE(okind, odata, o++, 't');
12859 }
12860 else if (ch == '\n') {
12861 PyUnicode_WRITE(okind, odata, o++, '\\');
12862 PyUnicode_WRITE(okind, odata, o++, 'n');
12863 }
12864 else if (ch == '\r') {
12865 PyUnicode_WRITE(okind, odata, o++, '\\');
12866 PyUnicode_WRITE(okind, odata, o++, 'r');
12867 }
12868
12869 /* Map non-printable US ASCII to '\xhh' */
12870 else if (ch < ' ' || ch == 0x7F) {
12871 PyUnicode_WRITE(okind, odata, o++, '\\');
12872 PyUnicode_WRITE(okind, odata, o++, 'x');
12873 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12874 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12875 }
12876
12877 /* Copy ASCII characters as-is */
12878 else if (ch < 0x7F) {
12879 PyUnicode_WRITE(okind, odata, o++, ch);
12880 }
12881
12882 /* Non-ASCII characters */
12883 else {
12884 /* Map Unicode whitespace and control characters
12885 (categories Z* and C* except ASCII space)
12886 */
12887 if (!Py_UNICODE_ISPRINTABLE(ch)) {
12888 PyUnicode_WRITE(okind, odata, o++, '\\');
12889 /* Map 8-bit characters to '\xhh' */
12890 if (ch <= 0xff) {
12891 PyUnicode_WRITE(okind, odata, o++, 'x');
12892 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12893 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12894 }
12895 /* Map 16-bit characters to '\uxxxx' */
12896 else if (ch <= 0xffff) {
12897 PyUnicode_WRITE(okind, odata, o++, 'u');
12898 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12899 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12900 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12901 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12902 }
12903 /* Map 21-bit characters to '\U00xxxxxx' */
12904 else {
12905 PyUnicode_WRITE(okind, odata, o++, 'U');
12906 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12907 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12908 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12909 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12910 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12911 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12912 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12913 PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12914 }
12915 }
12916 /* Copy characters as-is */
12917 else {
12918 PyUnicode_WRITE(okind, odata, o++, ch);
12919 }
Georg Brandl559e5d72008-06-11 18:37:52 +000012920 }
12921 }
Walter Dörwald79e913e2007-05-12 11:08:06 +000012922 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012923 /* Closing quote already added at the beginning */
Victor Stinner05d11892011-10-06 01:13:58 +020012924 assert(_PyUnicode_CheckConsistency(repr, 1));
Walter Dörwald79e913e2007-05-12 11:08:06 +000012925 return repr;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012926}
12927
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012928PyDoc_STRVAR(rfind__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012929 "S.rfind(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012930\n\
12931Return the highest index in S where substring sub is found,\n\
Senthil Kumaran53516a82011-07-27 23:33:54 +080012932such that sub is contained within S[start:end]. Optional\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012933arguments start and end are interpreted as in slice notation.\n\
12934\n\
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012935Return -1 on failure.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012936
12937static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012938unicode_rfind(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012939{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012940 /* initialize variables to prevent gcc warning */
12941 PyObject *substring = NULL;
12942 Py_ssize_t start = 0;
12943 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012944 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012945
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012946 if (!parse_args_finds_unicode("rfind", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012947 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012948
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012949 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012950 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012951
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012952 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012953
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012954 if (result == -2)
12955 return NULL;
12956
Christian Heimes217cfd12007-12-02 14:31:20 +000012957 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012958}
12959
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000012960PyDoc_STRVAR(rindex__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000012961 "S.rindex(sub[, start[, end]]) -> int\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000012962\n\
Lisa Roach43ba8862017-04-04 22:36:22 -070012963Return the highest index in S where substring sub is found,\n\
12964such that sub is contained within S[start:end]. Optional\n\
12965arguments start and end are interpreted as in slice notation.\n\
12966\n\
12967Raises ValueError when the substring is not found.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000012968
12969static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012970unicode_rindex(PyObject *self, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000012971{
Victor Stinner0c39b1b2015-03-18 15:02:06 +010012972 /* initialize variables to prevent gcc warning */
12973 PyObject *substring = NULL;
12974 Py_ssize_t start = 0;
12975 Py_ssize_t end = 0;
Thomas Wouters477c8d52006-05-27 19:21:47 +000012976 Py_ssize_t result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012977
Serhiy Storchakadd40fc32016-05-04 22:23:26 +030012978 if (!parse_args_finds_unicode("rindex", args, &substring, &start, &end))
Benjamin Peterson14339b62009-01-31 16:36:08 +000012979 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000012980
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012981 if (PyUnicode_READY(self) == -1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012982 return NULL;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012983
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030012984 result = any_find_slice(self, substring, start, end, -1);
Thomas Wouters477c8d52006-05-27 19:21:47 +000012985
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012986 if (result == -2)
12987 return NULL;
12988
Guido van Rossumd57fd912000-03-10 22:53:23 +000012989 if (result < 0) {
12990 PyErr_SetString(PyExc_ValueError, "substring not found");
12991 return NULL;
12992 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020012993
Christian Heimes217cfd12007-12-02 14:31:20 +000012994 return PyLong_FromSsize_t(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000012995}
12996
INADA Naoki3ae20562017-01-16 20:41:20 +090012997/*[clinic input]
12998str.rjust as unicode_rjust
12999
13000 width: Py_ssize_t
13001 fillchar: Py_UCS4 = ' '
13002 /
13003
13004Return a right-justified string of length width.
13005
13006Padding is done using the specified fill character (default is a space).
13007[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013008
13009static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013010unicode_rjust_impl(PyObject *self, Py_ssize_t width, Py_UCS4 fillchar)
13011/*[clinic end generated code: output=804a1a57fbe8d5cf input=d05f550b5beb1f72]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013012{
Benjamin Petersonbac79492012-01-14 13:34:47 -050013013 if (PyUnicode_READY(self) == -1)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013014 return NULL;
13015
Victor Stinnerc4b49542011-12-11 22:44:26 +010013016 if (PyUnicode_GET_LENGTH(self) >= width)
13017 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013018
Victor Stinnerc4b49542011-12-11 22:44:26 +010013019 return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013020}
13021
Alexander Belopolsky40018472011-02-26 01:02:56 +000013022PyObject *
13023PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013024{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013025 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013026 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013027
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013028 return split(s, sep, maxsplit);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013029}
13030
INADA Naoki3ae20562017-01-16 20:41:20 +090013031/*[clinic input]
13032str.split as unicode_split
Guido van Rossumd57fd912000-03-10 22:53:23 +000013033
INADA Naoki3ae20562017-01-16 20:41:20 +090013034 sep: object = None
13035 The delimiter according which to split the string.
13036 None (the default value) means split according to any whitespace,
13037 and discard empty strings from the result.
13038 maxsplit: Py_ssize_t = -1
13039 Maximum number of splits to do.
13040 -1 (the default value) means no limit.
13041
13042Return a list of the words in the string, using sep as the delimiter string.
13043[clinic start generated code]*/
13044
13045static PyObject *
13046unicode_split_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13047/*[clinic end generated code: output=3a65b1db356948dc input=606e750488a82359]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013048{
INADA Naoki3ae20562017-01-16 20:41:20 +090013049 if (sep == Py_None)
13050 return split(self, NULL, maxsplit);
13051 if (PyUnicode_Check(sep))
13052 return split(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013053
Victor Stinner998b8062018-09-12 00:23:25 +020013054 PyErr_Format(PyExc_TypeError,
13055 "must be str or None, not %.100s",
13056 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013057 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013058}
13059
Thomas Wouters477c8d52006-05-27 19:21:47 +000013060PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013061PyUnicode_Partition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013062{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013063 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013064 int kind1, kind2;
13065 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013066 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013067
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013068 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013069 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013070
Victor Stinner14f8f022011-10-05 20:58:25 +020013071 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013072 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013073 len1 = PyUnicode_GET_LENGTH(str_obj);
13074 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013075 if (kind1 < kind2 || len1 < len2) {
13076 _Py_INCREF_UNICODE_EMPTY();
13077 if (!unicode_empty)
13078 out = NULL;
13079 else {
13080 out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
13081 Py_DECREF(unicode_empty);
13082 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013083 return out;
13084 }
13085 buf1 = PyUnicode_DATA(str_obj);
13086 buf2 = PyUnicode_DATA(sep_obj);
13087 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013088 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013089 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013090 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013091 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013092
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013093 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013094 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013095 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13096 out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13097 else
13098 out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013099 break;
13100 case PyUnicode_2BYTE_KIND:
13101 out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13102 break;
13103 case PyUnicode_4BYTE_KIND:
13104 out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
13105 break;
13106 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013107 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013108 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013109
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013110 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013111 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013112
13113 return out;
13114}
13115
13116
13117PyObject *
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013118PyUnicode_RPartition(PyObject *str_obj, PyObject *sep_obj)
Thomas Wouters477c8d52006-05-27 19:21:47 +000013119{
Thomas Wouters477c8d52006-05-27 19:21:47 +000013120 PyObject* out;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013121 int kind1, kind2;
13122 void *buf1, *buf2;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013123 Py_ssize_t len1, len2;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013124
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013125 if (ensure_unicode(str_obj) < 0 || ensure_unicode(sep_obj) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000013126 return NULL;
Thomas Wouters477c8d52006-05-27 19:21:47 +000013127
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013128 kind1 = PyUnicode_KIND(str_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013129 kind2 = PyUnicode_KIND(sep_obj);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013130 len1 = PyUnicode_GET_LENGTH(str_obj);
13131 len2 = PyUnicode_GET_LENGTH(sep_obj);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013132 if (kind1 < kind2 || len1 < len2) {
13133 _Py_INCREF_UNICODE_EMPTY();
13134 if (!unicode_empty)
13135 out = NULL;
13136 else {
13137 out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
13138 Py_DECREF(unicode_empty);
13139 }
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013140 return out;
13141 }
13142 buf1 = PyUnicode_DATA(str_obj);
13143 buf2 = PyUnicode_DATA(sep_obj);
13144 if (kind2 != kind1) {
Serhiy Storchaka17b47332020-04-01 15:41:49 +030013145 buf2 = unicode_askind(kind2, buf2, len2, kind1);
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013146 if (!buf2)
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013147 return NULL;
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013148 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013149
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013150 switch (kind1) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013151 case PyUnicode_1BYTE_KIND:
Victor Stinnerc3cec782011-10-05 21:24:08 +020013152 if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
13153 out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13154 else
13155 out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013156 break;
13157 case PyUnicode_2BYTE_KIND:
13158 out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13159 break;
13160 case PyUnicode_4BYTE_KIND:
13161 out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
13162 break;
13163 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013164 Py_UNREACHABLE();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013165 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000013166
Serhiy Storchakad9d769f2015-03-24 21:55:47 +020013167 if (kind2 != kind1)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013168 PyMem_Free(buf2);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013169
13170 return out;
13171}
13172
INADA Naoki3ae20562017-01-16 20:41:20 +090013173/*[clinic input]
13174str.partition as unicode_partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013175
INADA Naoki3ae20562017-01-16 20:41:20 +090013176 sep: object
13177 /
13178
13179Partition the string into three parts using the given separator.
13180
13181This will search for the separator in the string. If the separator is found,
13182returns a 3-tuple containing the part before the separator, the separator
13183itself, and the part after it.
13184
13185If the separator is not found, returns a 3-tuple containing the original string
13186and two empty strings.
13187[clinic start generated code]*/
13188
13189static PyObject *
13190unicode_partition(PyObject *self, PyObject *sep)
13191/*[clinic end generated code: output=e4ced7bd253ca3c4 input=f29b8d06c63e50be]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013192{
INADA Naoki3ae20562017-01-16 20:41:20 +090013193 return PyUnicode_Partition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013194}
13195
INADA Naoki3ae20562017-01-16 20:41:20 +090013196/*[clinic input]
13197str.rpartition as unicode_rpartition = str.partition
Thomas Wouters477c8d52006-05-27 19:21:47 +000013198
INADA Naoki3ae20562017-01-16 20:41:20 +090013199Partition the string into three parts using the given separator.
13200
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013201This will search for the separator in the string, starting at the end. If
INADA Naoki3ae20562017-01-16 20:41:20 +090013202the separator is found, returns a 3-tuple containing the part before the
13203separator, the separator itself, and the part after it.
13204
13205If the separator is not found, returns a 3-tuple containing two empty strings
13206and the original string.
13207[clinic start generated code]*/
13208
13209static PyObject *
13210unicode_rpartition(PyObject *self, PyObject *sep)
Serhiy Storchakaa2314282017-10-29 02:11:54 +030013211/*[clinic end generated code: output=1aa13cf1156572aa input=c4b7db3ef5cf336a]*/
Thomas Wouters477c8d52006-05-27 19:21:47 +000013212{
INADA Naoki3ae20562017-01-16 20:41:20 +090013213 return PyUnicode_RPartition(self, sep);
Thomas Wouters477c8d52006-05-27 19:21:47 +000013214}
13215
Alexander Belopolsky40018472011-02-26 01:02:56 +000013216PyObject *
13217PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013218{
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013219 if (ensure_unicode(s) < 0 || (sep != NULL && ensure_unicode(sep) < 0))
Benjamin Peterson14339b62009-01-31 16:36:08 +000013220 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013221
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013222 return rsplit(s, sep, maxsplit);
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013223}
13224
INADA Naoki3ae20562017-01-16 20:41:20 +090013225/*[clinic input]
13226str.rsplit as unicode_rsplit = str.split
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013227
INADA Naoki3ae20562017-01-16 20:41:20 +090013228Return a list of the words in the string, using sep as the delimiter string.
13229
13230Splits are done starting at the end of the string and working to the front.
13231[clinic start generated code]*/
13232
13233static PyObject *
13234unicode_rsplit_impl(PyObject *self, PyObject *sep, Py_ssize_t maxsplit)
13235/*[clinic end generated code: output=c2b815c63bcabffc input=12ad4bf57dd35f15]*/
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013236{
INADA Naoki3ae20562017-01-16 20:41:20 +090013237 if (sep == Py_None)
13238 return rsplit(self, NULL, maxsplit);
13239 if (PyUnicode_Check(sep))
13240 return rsplit(self, sep, maxsplit);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013241
Victor Stinner998b8062018-09-12 00:23:25 +020013242 PyErr_Format(PyExc_TypeError,
13243 "must be str or None, not %.100s",
13244 Py_TYPE(sep)->tp_name);
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013245 return NULL;
Hye-Shik Chang3ae811b2003-12-15 18:49:53 +000013246}
13247
INADA Naoki3ae20562017-01-16 20:41:20 +090013248/*[clinic input]
13249str.splitlines as unicode_splitlines
Guido van Rossumd57fd912000-03-10 22:53:23 +000013250
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013251 keepends: bool(accept={int}) = False
INADA Naoki3ae20562017-01-16 20:41:20 +090013252
13253Return a list of the lines in the string, breaking at line boundaries.
13254
13255Line breaks are not included in the resulting list unless keepends is given and
13256true.
13257[clinic start generated code]*/
13258
13259static PyObject *
13260unicode_splitlines_impl(PyObject *self, int keepends)
Serhiy Storchaka202fda52017-03-12 10:10:47 +020013261/*[clinic end generated code: output=f664dcdad153ec40 input=b508e180459bdd8b]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013262{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013263 return PyUnicode_Splitlines(self, keepends);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013264}
13265
13266static
Guido van Rossumf15a29f2007-05-04 00:41:39 +000013267PyObject *unicode_str(PyObject *self)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013268{
Victor Stinnerc4b49542011-12-11 22:44:26 +010013269 return unicode_result_unchanged(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013270}
13271
INADA Naoki3ae20562017-01-16 20:41:20 +090013272/*[clinic input]
13273str.swapcase as unicode_swapcase
Guido van Rossumd57fd912000-03-10 22:53:23 +000013274
INADA Naoki3ae20562017-01-16 20:41:20 +090013275Convert uppercase characters to lowercase and lowercase characters to uppercase.
13276[clinic start generated code]*/
13277
13278static PyObject *
13279unicode_swapcase_impl(PyObject *self)
13280/*[clinic end generated code: output=5d28966bf6d7b2af input=3f3ef96d5798a7bb]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013281{
Benjamin Petersoneea48462012-01-16 14:28:50 -050013282 if (PyUnicode_READY(self) == -1)
13283 return NULL;
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013284 return case_operation(self, do_swapcase);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013285}
13286
Larry Hastings61272b72014-01-07 12:41:53 -080013287/*[clinic input]
Georg Brandlceee0772007-11-27 23:48:05 +000013288
Larry Hastings31826802013-10-19 00:09:25 -070013289@staticmethod
13290str.maketrans as unicode_maketrans
13291
13292 x: object
13293
13294 y: unicode=NULL
13295
13296 z: unicode=NULL
13297
13298 /
13299
13300Return a translation table usable for str.translate().
13301
13302If there is only one argument, it must be a dictionary mapping Unicode
13303ordinals (integers) or characters to Unicode ordinals, strings or None.
13304Character keys will be then converted to ordinals.
13305If there are two arguments, they must be strings of equal length, and
13306in the resulting dictionary, each character in x will be mapped to the
13307character at the same position in y. If there is a third argument, it
13308must be a string, whose characters will be mapped to None in the result.
Larry Hastings61272b72014-01-07 12:41:53 -080013309[clinic start generated code]*/
Larry Hastings31826802013-10-19 00:09:25 -070013310
Larry Hastings31826802013-10-19 00:09:25 -070013311static PyObject *
Larry Hastings5c661892014-01-24 06:17:25 -080013312unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
Serhiy Storchaka1009bf12015-04-03 23:53:51 +030013313/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
Larry Hastings31826802013-10-19 00:09:25 -070013314{
Georg Brandlceee0772007-11-27 23:48:05 +000013315 PyObject *new = NULL, *key, *value;
13316 Py_ssize_t i = 0;
13317 int res;
Benjamin Peterson14339b62009-01-31 16:36:08 +000013318
Georg Brandlceee0772007-11-27 23:48:05 +000013319 new = PyDict_New();
13320 if (!new)
13321 return NULL;
13322 if (y != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013323 int x_kind, y_kind, z_kind;
13324 void *x_data, *y_data, *z_data;
13325
Georg Brandlceee0772007-11-27 23:48:05 +000013326 /* x must be a string too, of equal length */
Georg Brandlceee0772007-11-27 23:48:05 +000013327 if (!PyUnicode_Check(x)) {
13328 PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13329 "be a string if there is a second argument");
13330 goto err;
13331 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013332 if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013333 PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13334 "arguments must have equal length");
13335 goto err;
13336 }
13337 /* create entries for translating chars in x to those in y */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013338 x_kind = PyUnicode_KIND(x);
13339 y_kind = PyUnicode_KIND(y);
13340 x_data = PyUnicode_DATA(x);
13341 y_data = PyUnicode_DATA(y);
13342 for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13343 key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013344 if (!key)
Georg Brandlceee0772007-11-27 23:48:05 +000013345 goto err;
Benjamin Peterson822c7902011-12-20 13:32:50 -060013346 value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
Benjamin Peterson53aa1d72011-12-20 13:29:45 -060013347 if (!value) {
13348 Py_DECREF(key);
13349 goto err;
13350 }
Georg Brandlceee0772007-11-27 23:48:05 +000013351 res = PyDict_SetItem(new, key, value);
13352 Py_DECREF(key);
13353 Py_DECREF(value);
13354 if (res < 0)
13355 goto err;
13356 }
13357 /* create entries for deleting chars in z */
13358 if (z != NULL) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013359 z_kind = PyUnicode_KIND(z);
13360 z_data = PyUnicode_DATA(z);
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013361 for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013362 key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
Georg Brandlceee0772007-11-27 23:48:05 +000013363 if (!key)
13364 goto err;
13365 res = PyDict_SetItem(new, key, Py_None);
13366 Py_DECREF(key);
13367 if (res < 0)
13368 goto err;
13369 }
13370 }
13371 } else {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013372 int kind;
13373 void *data;
13374
Georg Brandlceee0772007-11-27 23:48:05 +000013375 /* x must be a dict */
Raymond Hettinger3ad05762009-05-29 22:11:22 +000013376 if (!PyDict_CheckExact(x)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013377 PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13378 "to maketrans it must be a dict");
13379 goto err;
13380 }
13381 /* copy entries into the new dict, converting string keys to int keys */
13382 while (PyDict_Next(x, &i, &key, &value)) {
13383 if (PyUnicode_Check(key)) {
13384 /* convert string keys to integer keys */
13385 PyObject *newkey;
Victor Stinnerc4f281e2011-10-11 22:11:42 +020013386 if (PyUnicode_GET_LENGTH(key) != 1) {
Georg Brandlceee0772007-11-27 23:48:05 +000013387 PyErr_SetString(PyExc_ValueError, "string keys in translate "
13388 "table must be of length 1");
13389 goto err;
13390 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013391 kind = PyUnicode_KIND(key);
13392 data = PyUnicode_DATA(key);
13393 newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
Georg Brandlceee0772007-11-27 23:48:05 +000013394 if (!newkey)
13395 goto err;
13396 res = PyDict_SetItem(new, newkey, value);
13397 Py_DECREF(newkey);
13398 if (res < 0)
13399 goto err;
Christian Heimes217cfd12007-12-02 14:31:20 +000013400 } else if (PyLong_Check(key)) {
Georg Brandlceee0772007-11-27 23:48:05 +000013401 /* just keep integer keys */
13402 if (PyDict_SetItem(new, key, value) < 0)
13403 goto err;
13404 } else {
13405 PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13406 "be strings or integers");
13407 goto err;
13408 }
13409 }
13410 }
13411 return new;
13412 err:
13413 Py_DECREF(new);
13414 return NULL;
13415}
13416
INADA Naoki3ae20562017-01-16 20:41:20 +090013417/*[clinic input]
13418str.translate as unicode_translate
Guido van Rossumd57fd912000-03-10 22:53:23 +000013419
INADA Naoki3ae20562017-01-16 20:41:20 +090013420 table: object
13421 Translation table, which must be a mapping of Unicode ordinals to
13422 Unicode ordinals, strings, or None.
13423 /
13424
13425Replace each character in the string using the given translation table.
13426
13427The table must implement lookup/indexing via __getitem__, for instance a
13428dictionary or list. If this operation raises LookupError, the character is
13429left untouched. Characters mapped to None are deleted.
13430[clinic start generated code]*/
13431
13432static PyObject *
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013433unicode_translate(PyObject *self, PyObject *table)
INADA Naoki3ae20562017-01-16 20:41:20 +090013434/*[clinic end generated code: output=3cb448ff2fd96bf3 input=6d38343db63d8eb0]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013435{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013436 return _PyUnicode_TranslateCharmap(self, table, "ignore");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013437}
13438
INADA Naoki3ae20562017-01-16 20:41:20 +090013439/*[clinic input]
13440str.upper as unicode_upper
Guido van Rossumd57fd912000-03-10 22:53:23 +000013441
INADA Naoki3ae20562017-01-16 20:41:20 +090013442Return a copy of the string converted to uppercase.
13443[clinic start generated code]*/
13444
13445static PyObject *
13446unicode_upper_impl(PyObject *self)
13447/*[clinic end generated code: output=1b7ddd16bbcdc092 input=db3d55682dfe2e6c]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013448{
Benjamin Petersonb2bf01d2012-01-11 18:17:06 -050013449 if (PyUnicode_READY(self) == -1)
13450 return NULL;
13451 if (PyUnicode_IS_ASCII(self))
13452 return ascii_upper_or_lower(self, 0);
Victor Stinnerb0800dc2012-02-25 00:47:08 +010013453 return case_operation(self, do_upper);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013454}
13455
INADA Naoki3ae20562017-01-16 20:41:20 +090013456/*[clinic input]
13457str.zfill as unicode_zfill
13458
13459 width: Py_ssize_t
13460 /
13461
13462Pad a numeric string with zeros on the left, to fill a field of the given width.
13463
13464The string is never truncated.
13465[clinic start generated code]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013466
13467static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013468unicode_zfill_impl(PyObject *self, Py_ssize_t width)
INADA Naoki15f94592017-01-16 21:49:13 +090013469/*[clinic end generated code: output=e13fb6bdf8e3b9df input=c6b2f772c6f27799]*/
Guido van Rossumd57fd912000-03-10 22:53:23 +000013470{
Martin v. Löwis18e16552006-02-15 17:27:45 +000013471 Py_ssize_t fill;
Victor Stinner9310abb2011-10-05 00:59:23 +020013472 PyObject *u;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013473 int kind;
13474 void *data;
13475 Py_UCS4 chr;
13476
Benjamin Petersonbac79492012-01-14 13:34:47 -050013477 if (PyUnicode_READY(self) == -1)
Victor Stinnerc4b49542011-12-11 22:44:26 +010013478 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013479
Victor Stinnerc4b49542011-12-11 22:44:26 +010013480 if (PyUnicode_GET_LENGTH(self) >= width)
13481 return unicode_result_unchanged(self);
13482
13483 fill = width - PyUnicode_GET_LENGTH(self);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013484
13485 u = pad(self, fill, 0, '0');
13486
Walter Dörwald068325e2002-04-15 13:36:47 +000013487 if (u == NULL)
13488 return NULL;
13489
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013490 kind = PyUnicode_KIND(u);
13491 data = PyUnicode_DATA(u);
13492 chr = PyUnicode_READ(kind, data, fill);
13493
13494 if (chr == '+' || chr == '-') {
Guido van Rossumd57fd912000-03-10 22:53:23 +000013495 /* move sign to beginning of string */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013496 PyUnicode_WRITE(kind, data, 0, chr);
13497 PyUnicode_WRITE(kind, data, fill, '0');
Guido van Rossumd57fd912000-03-10 22:53:23 +000013498 }
13499
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020013500 assert(_PyUnicode_CheckConsistency(u, 1));
Victor Stinner7931d9a2011-11-04 00:22:48 +010013501 return u;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013502}
Guido van Rossumd57fd912000-03-10 22:53:23 +000013503
#if 0
/* Compiled out by the enclosing "#if 0". */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13511
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013512PyDoc_STRVAR(startswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013513 "S.startswith(prefix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013514\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013515Return True if S starts with the specified prefix, False otherwise.\n\
13516With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013517With optional end, stop comparing S at that position.\n\
13518prefix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013519
13520static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013521unicode_startswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013522 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013523{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013524 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013525 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013526 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013527 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013528 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013529
Jesus Ceaac451502011-04-20 17:09:23 +020013530 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013531 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013532 if (PyTuple_Check(subobj)) {
13533 Py_ssize_t i;
13534 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013535 substring = PyTuple_GET_ITEM(subobj, i);
13536 if (!PyUnicode_Check(substring)) {
13537 PyErr_Format(PyExc_TypeError,
13538 "tuple for startswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013539 "not %.100s",
13540 Py_TYPE(substring)->tp_name);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013541 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013542 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013543 result = tailmatch(self, substring, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013544 if (result == -1)
13545 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013546 if (result) {
13547 Py_RETURN_TRUE;
13548 }
13549 }
13550 /* nothing matched */
13551 Py_RETURN_FALSE;
13552 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013553 if (!PyUnicode_Check(subobj)) {
13554 PyErr_Format(PyExc_TypeError,
13555 "startswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013556 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013557 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013558 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013559 result = tailmatch(self, subobj, start, end, -1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013560 if (result == -1)
13561 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013562 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013563}
13564
13565
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000013566PyDoc_STRVAR(endswith__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013567 "S.endswith(suffix[, start[, end]]) -> bool\n\
Guido van Rossumd57fd912000-03-10 22:53:23 +000013568\n\
Guido van Rossuma7132182003-04-09 19:32:45 +000013569Return True if S ends with the specified suffix, False otherwise.\n\
13570With optional start, test S beginning at that position.\n\
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013571With optional end, stop comparing S at that position.\n\
13572suffix can also be a tuple of strings to try.");
Guido van Rossumd57fd912000-03-10 22:53:23 +000013573
13574static PyObject *
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013575unicode_endswith(PyObject *self,
Benjamin Peterson29060642009-01-31 22:14:21 +000013576 PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000013577{
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013578 PyObject *subobj;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020013579 PyObject *substring;
Martin v. Löwis18e16552006-02-15 17:27:45 +000013580 Py_ssize_t start = 0;
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000013581 Py_ssize_t end = PY_SSIZE_T_MAX;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013582 int result;
Guido van Rossumd57fd912000-03-10 22:53:23 +000013583
Jesus Ceaac451502011-04-20 17:09:23 +020013584 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
Benjamin Peterson29060642009-01-31 22:14:21 +000013585 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013586 if (PyTuple_Check(subobj)) {
13587 Py_ssize_t i;
13588 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013589 substring = PyTuple_GET_ITEM(subobj, i);
13590 if (!PyUnicode_Check(substring)) {
13591 PyErr_Format(PyExc_TypeError,
13592 "tuple for endswith must only contain str, "
Victor Stinner998b8062018-09-12 00:23:25 +020013593 "not %.100s",
13594 Py_TYPE(substring)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013595 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013596 }
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013597 result = tailmatch(self, substring, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013598 if (result == -1)
13599 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013600 if (result) {
13601 Py_RETURN_TRUE;
13602 }
13603 }
13604 Py_RETURN_FALSE;
13605 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013606 if (!PyUnicode_Check(subobj)) {
13607 PyErr_Format(PyExc_TypeError,
13608 "endswith first arg must be str or "
Victor Stinner998b8062018-09-12 00:23:25 +020013609 "a tuple of str, not %.100s", Py_TYPE(subobj)->tp_name);
Benjamin Peterson29060642009-01-31 22:14:21 +000013610 return NULL;
Ezio Melottiba42fd52011-04-26 06:09:45 +030013611 }
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030013612 result = tailmatch(self, subobj, start, end, +1);
Victor Stinner18aa4472013-01-03 03:18:09 +010013613 if (result == -1)
13614 return NULL;
Thomas Wouters0e3f5912006-08-11 14:57:12 +000013615 return PyBool_FromLong(result);
Guido van Rossumd57fd912000-03-10 22:53:23 +000013616}
13617
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013618static inline void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013619_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013620{
Victor Stinnereb36fda2015-10-03 01:55:51 +020013621 writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13622 writer->data = PyUnicode_DATA(writer->buffer);
13623
13624 if (!writer->readonly) {
13625 writer->kind = PyUnicode_KIND(writer->buffer);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013626 writer->size = PyUnicode_GET_LENGTH(writer->buffer);
Victor Stinnereb36fda2015-10-03 01:55:51 +020013627 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013628 else {
Victor Stinnereb36fda2015-10-03 01:55:51 +020013629 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13630 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13631 writer->kind = PyUnicode_WCHAR_KIND;
13632 assert(writer->kind <= PyUnicode_1BYTE_KIND);
13633
Victor Stinner8f674cc2013-04-17 23:02:17 +020013634 /* Copy-on-write mode: set buffer size to 0 so
13635 * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13636 * next write. */
13637 writer->size = 0;
13638 }
Victor Stinner202fdca2012-05-07 12:47:02 +020013639}
13640
Victor Stinnerd3f08822012-05-29 12:57:52 +020013641void
Victor Stinner8f674cc2013-04-17 23:02:17 +020013642_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013643{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013644 memset(writer, 0, sizeof(*writer));
Victor Stinnereb36fda2015-10-03 01:55:51 +020013645
13646 /* ASCII is the bare minimum */
Victor Stinner8f674cc2013-04-17 23:02:17 +020013647 writer->min_char = 127;
Victor Stinnereb36fda2015-10-03 01:55:51 +020013648
13649 /* use a value smaller than PyUnicode_1BYTE_KIND() so
13650 _PyUnicodeWriter_PrepareKind() will copy the buffer. */
13651 writer->kind = PyUnicode_WCHAR_KIND;
13652 assert(writer->kind <= PyUnicode_1BYTE_KIND);
Victor Stinner202fdca2012-05-07 12:47:02 +020013653}
13654
Inada Naoki770847a2019-06-24 12:30:24 +090013655// Initialize _PyUnicodeWriter with initial buffer
13656static inline void
13657_PyUnicodeWriter_InitWithBuffer(_PyUnicodeWriter *writer, PyObject *buffer)
13658{
13659 memset(writer, 0, sizeof(*writer));
13660 writer->buffer = buffer;
13661 _PyUnicodeWriter_Update(writer);
13662 writer->min_length = writer->size;
13663}
13664
Victor Stinnerd3f08822012-05-29 12:57:52 +020013665int
13666_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13667 Py_ssize_t length, Py_UCS4 maxchar)
Victor Stinner202fdca2012-05-07 12:47:02 +020013668{
13669 Py_ssize_t newlen;
13670 PyObject *newbuffer;
13671
Victor Stinner2740e462016-09-06 16:58:36 -070013672 assert(maxchar <= MAX_UNICODE);
13673
Victor Stinnerca9381e2015-09-22 00:58:32 +020013674 /* ensure that the _PyUnicodeWriter_Prepare macro was used */
Victor Stinner61744742015-09-22 01:01:17 +020013675 assert((maxchar > writer->maxchar && length >= 0)
13676 || length > 0);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013677
Victor Stinner202fdca2012-05-07 12:47:02 +020013678 if (length > PY_SSIZE_T_MAX - writer->pos) {
13679 PyErr_NoMemory();
13680 return -1;
13681 }
13682 newlen = writer->pos + length;
13683
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070013684 maxchar = Py_MAX(maxchar, writer->min_char);
Victor Stinner8f674cc2013-04-17 23:02:17 +020013685
Victor Stinnerd3f08822012-05-29 12:57:52 +020013686 if (writer->buffer == NULL) {
Victor Stinner8f674cc2013-04-17 23:02:17 +020013687 assert(!writer->readonly);
Victor Stinner6989ba02013-11-18 21:08:39 +010013688 if (writer->overallocate
13689 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13690 /* overallocate to limit the number of realloc() */
13691 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013692 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013693 if (newlen < writer->min_length)
13694 newlen = writer->min_length;
13695
Victor Stinnerd3f08822012-05-29 12:57:52 +020013696 writer->buffer = PyUnicode_New(newlen, maxchar);
13697 if (writer->buffer == NULL)
13698 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013699 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013700 else if (newlen > writer->size) {
Victor Stinner6989ba02013-11-18 21:08:39 +010013701 if (writer->overallocate
13702 && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13703 /* overallocate to limit the number of realloc() */
13704 newlen += newlen / OVERALLOCATE_FACTOR;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013705 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013706 if (newlen < writer->min_length)
13707 newlen = writer->min_length;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013708
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013709 if (maxchar > writer->maxchar || writer->readonly) {
Victor Stinner202fdca2012-05-07 12:47:02 +020013710 /* resize + widen */
Serhiy Storchaka28b21e52015-10-02 13:07:28 +030013711 maxchar = Py_MAX(maxchar, writer->maxchar);
Victor Stinner202fdca2012-05-07 12:47:02 +020013712 newbuffer = PyUnicode_New(newlen, maxchar);
13713 if (newbuffer == NULL)
13714 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013715 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13716 writer->buffer, 0, writer->pos);
Victor Stinner202fdca2012-05-07 12:47:02 +020013717 Py_DECREF(writer->buffer);
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013718 writer->readonly = 0;
Victor Stinner202fdca2012-05-07 12:47:02 +020013719 }
13720 else {
13721 newbuffer = resize_compact(writer->buffer, newlen);
13722 if (newbuffer == NULL)
13723 return -1;
13724 }
13725 writer->buffer = newbuffer;
Victor Stinner202fdca2012-05-07 12:47:02 +020013726 }
13727 else if (maxchar > writer->maxchar) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013728 assert(!writer->readonly);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013729 newbuffer = PyUnicode_New(writer->size, maxchar);
13730 if (newbuffer == NULL)
Victor Stinner202fdca2012-05-07 12:47:02 +020013731 return -1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013732 _PyUnicode_FastCopyCharacters(newbuffer, 0,
13733 writer->buffer, 0, writer->pos);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030013734 Py_SETREF(writer->buffer, newbuffer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013735 }
Victor Stinner8f674cc2013-04-17 23:02:17 +020013736 _PyUnicodeWriter_Update(writer);
Victor Stinner202fdca2012-05-07 12:47:02 +020013737 return 0;
Victor Stinner6989ba02013-11-18 21:08:39 +010013738
13739#undef OVERALLOCATE_FACTOR
Victor Stinner202fdca2012-05-07 12:47:02 +020013740}
13741
Victor Stinnerca9381e2015-09-22 00:58:32 +020013742int
13743_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13744 enum PyUnicode_Kind kind)
13745{
13746 Py_UCS4 maxchar;
13747
13748 /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13749 assert(writer->kind < kind);
13750
13751 switch (kind)
13752 {
13753 case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13754 case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13755 case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13756 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013757 Py_UNREACHABLE();
Victor Stinnerca9381e2015-09-22 00:58:32 +020013758 }
13759
13760 return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13761}
13762
Benjamin Peterson2e7c5e92016-09-07 15:33:32 -070013763static inline int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013764_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
Victor Stinnera0dd0212013-04-11 22:09:04 +020013765{
Victor Stinner2740e462016-09-06 16:58:36 -070013766 assert(ch <= MAX_UNICODE);
Victor Stinnera0dd0212013-04-11 22:09:04 +020013767 if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13768 return -1;
13769 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13770 writer->pos++;
13771 return 0;
13772}
13773
13774int
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020013775_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13776{
13777 return _PyUnicodeWriter_WriteCharInline(writer, ch);
13778}
13779
13780int
Victor Stinnerd3f08822012-05-29 12:57:52 +020013781_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13782{
13783 Py_UCS4 maxchar;
13784 Py_ssize_t len;
13785
13786 if (PyUnicode_READY(str) == -1)
13787 return -1;
13788 len = PyUnicode_GET_LENGTH(str);
13789 if (len == 0)
13790 return 0;
13791 maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13792 if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013793 if (writer->buffer == NULL && !writer->overallocate) {
Victor Stinner1912b392015-03-26 09:37:23 +010013794 assert(_PyUnicode_CheckConsistency(str, 1));
Victor Stinner8f674cc2013-04-17 23:02:17 +020013795 writer->readonly = 1;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013796 Py_INCREF(str);
13797 writer->buffer = str;
13798 _PyUnicodeWriter_Update(writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013799 writer->pos += len;
13800 return 0;
13801 }
13802 if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13803 return -1;
13804 }
13805 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13806 str, 0, len);
13807 writer->pos += len;
13808 return 0;
13809}
13810
Victor Stinnere215d962012-10-06 23:03:36 +020013811int
Victor Stinnercfc4c132013-04-03 01:48:39 +020013812_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13813 Py_ssize_t start, Py_ssize_t end)
13814{
13815 Py_UCS4 maxchar;
13816 Py_ssize_t len;
13817
13818 if (PyUnicode_READY(str) == -1)
13819 return -1;
13820
13821 assert(0 <= start);
13822 assert(end <= PyUnicode_GET_LENGTH(str));
13823 assert(start <= end);
13824
13825 if (end == 0)
13826 return 0;
13827
13828 if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13829 return _PyUnicodeWriter_WriteStr(writer, str);
13830
13831 if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13832 maxchar = _PyUnicode_FindMaxChar(str, start, end);
13833 else
13834 maxchar = writer->maxchar;
13835 len = end - start;
13836
13837 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13838 return -1;
13839
13840 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13841 str, start, len);
13842 writer->pos += len;
13843 return 0;
13844}
13845
13846int
Victor Stinner4a587072013-11-19 12:54:53 +010013847_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13848 const char *ascii, Py_ssize_t len)
13849{
13850 if (len == -1)
13851 len = strlen(ascii);
13852
Andy Lestere6be9b52020-02-11 20:28:35 -060013853 assert(ucs1lib_find_max_char((const Py_UCS1*)ascii, (const Py_UCS1*)ascii + len) < 128);
Victor Stinner4a587072013-11-19 12:54:53 +010013854
13855 if (writer->buffer == NULL && !writer->overallocate) {
13856 PyObject *str;
13857
13858 str = _PyUnicode_FromASCII(ascii, len);
13859 if (str == NULL)
13860 return -1;
13861
13862 writer->readonly = 1;
13863 writer->buffer = str;
13864 _PyUnicodeWriter_Update(writer);
13865 writer->pos += len;
13866 return 0;
13867 }
13868
13869 if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13870 return -1;
13871
13872 switch (writer->kind)
13873 {
13874 case PyUnicode_1BYTE_KIND:
13875 {
13876 const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13877 Py_UCS1 *data = writer->data;
13878
Christian Heimesf051e432016-09-13 20:22:02 +020013879 memcpy(data + writer->pos, str, len);
Victor Stinner4a587072013-11-19 12:54:53 +010013880 break;
13881 }
13882 case PyUnicode_2BYTE_KIND:
13883 {
13884 _PyUnicode_CONVERT_BYTES(
13885 Py_UCS1, Py_UCS2,
13886 ascii, ascii + len,
13887 (Py_UCS2 *)writer->data + writer->pos);
13888 break;
13889 }
13890 case PyUnicode_4BYTE_KIND:
13891 {
13892 _PyUnicode_CONVERT_BYTES(
13893 Py_UCS1, Py_UCS4,
13894 ascii, ascii + len,
13895 (Py_UCS4 *)writer->data + writer->pos);
13896 break;
13897 }
13898 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070013899 Py_UNREACHABLE();
Victor Stinner4a587072013-11-19 12:54:53 +010013900 }
13901
13902 writer->pos += len;
13903 return 0;
13904}
13905
13906int
13907_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13908 const char *str, Py_ssize_t len)
Victor Stinnere215d962012-10-06 23:03:36 +020013909{
13910 Py_UCS4 maxchar;
13911
Andy Lestere6be9b52020-02-11 20:28:35 -060013912 maxchar = ucs1lib_find_max_char((const Py_UCS1*)str, (const Py_UCS1*)str + len);
Victor Stinnere215d962012-10-06 23:03:36 +020013913 if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13914 return -1;
13915 unicode_write_cstr(writer->buffer, writer->pos, str, len);
13916 writer->pos += len;
13917 return 0;
13918}
13919
Victor Stinnerd3f08822012-05-29 12:57:52 +020013920PyObject *
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013921_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013922{
Victor Stinner15a0bd32013-07-08 22:29:55 +020013923 PyObject *str;
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013924
Victor Stinnerd3f08822012-05-29 12:57:52 +020013925 if (writer->pos == 0) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013926 Py_CLEAR(writer->buffer);
Serhiy Storchaka678db842013-01-26 12:16:36 +020013927 _Py_RETURN_UNICODE_EMPTY();
Victor Stinnerd3f08822012-05-29 12:57:52 +020013928 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013929
13930 str = writer->buffer;
13931 writer->buffer = NULL;
13932
Victor Stinnerd7b7c742012-06-04 22:52:12 +020013933 if (writer->readonly) {
Victor Stinner9e6b4d72013-07-09 00:37:24 +020013934 assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13935 return str;
Victor Stinnerd3f08822012-05-29 12:57:52 +020013936 }
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013937
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013938 if (PyUnicode_GET_LENGTH(str) != writer->pos) {
13939 PyObject *str2;
13940 str2 = resize_compact(str, writer->pos);
13941 if (str2 == NULL) {
13942 Py_DECREF(str);
13943 return NULL;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013944 }
Serhiy Storchakac8bc3d12016-10-25 13:23:56 +030013945 str = str2;
Victor Stinner6c2cdae2015-10-12 13:29:43 +020013946 }
13947
Victor Stinner15a0bd32013-07-08 22:29:55 +020013948 assert(_PyUnicode_CheckConsistency(str, 1));
13949 return unicode_result_ready(str);
Victor Stinner202fdca2012-05-07 12:47:02 +020013950}
13951
Victor Stinnerd3f08822012-05-29 12:57:52 +020013952void
Victor Stinner3b1a74a2012-05-09 22:25:00 +020013953_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
Victor Stinner202fdca2012-05-07 12:47:02 +020013954{
13955 Py_CLEAR(writer->buffer);
13956}
13957
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020013958#include "stringlib/unicode_format.h"
Eric Smith8c663262007-08-25 02:26:07 +000013959
13960PyDoc_STRVAR(format__doc__,
Benjamin Peterson29060642009-01-31 22:14:21 +000013961 "S.format(*args, **kwargs) -> str\n\
Eric Smith8c663262007-08-25 02:26:07 +000013962\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013963Return a formatted version of S, using substitutions from args and kwargs.\n\
13964The substitutions are identified by braces ('{' and '}').");
Eric Smith8c663262007-08-25 02:26:07 +000013965
Eric Smith27bbca62010-11-04 17:06:58 +000013966PyDoc_STRVAR(format_map__doc__,
13967 "S.format_map(mapping) -> str\n\
13968\n\
Eric Smith51d2fd92010-11-06 19:27:37 +000013969Return a formatted version of S, using substitutions from mapping.\n\
13970The substitutions are identified by braces ('{' and '}').");
Eric Smith27bbca62010-11-04 17:06:58 +000013971
INADA Naoki3ae20562017-01-16 20:41:20 +090013972/*[clinic input]
13973str.__format__ as unicode___format__
13974
13975 format_spec: unicode
13976 /
13977
13978Return a formatted version of the string as described by format_spec.
13979[clinic start generated code]*/
13980
Eric Smith4a7d76d2008-05-30 18:10:19 +000013981static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090013982unicode___format___impl(PyObject *self, PyObject *format_spec)
INADA Naoki15f94592017-01-16 21:49:13 +090013983/*[clinic end generated code: output=45fceaca6d2ba4c8 input=5e135645d167a214]*/
Eric Smith4a7d76d2008-05-30 18:10:19 +000013984{
Victor Stinnerd3f08822012-05-29 12:57:52 +020013985 _PyUnicodeWriter writer;
13986 int ret;
Eric Smith4a7d76d2008-05-30 18:10:19 +000013987
Victor Stinnerd3f08822012-05-29 12:57:52 +020013988 if (PyUnicode_READY(self) == -1)
13989 return NULL;
Victor Stinner8f674cc2013-04-17 23:02:17 +020013990 _PyUnicodeWriter_Init(&writer);
Victor Stinnerd3f08822012-05-29 12:57:52 +020013991 ret = _PyUnicode_FormatAdvancedWriter(&writer,
13992 self, format_spec, 0,
13993 PyUnicode_GET_LENGTH(format_spec));
13994 if (ret == -1) {
13995 _PyUnicodeWriter_Dealloc(&writer);
13996 return NULL;
13997 }
13998 return _PyUnicodeWriter_Finish(&writer);
Eric Smith4a7d76d2008-05-30 18:10:19 +000013999}
14000
INADA Naoki3ae20562017-01-16 20:41:20 +090014001/*[clinic input]
14002str.__sizeof__ as unicode_sizeof
14003
14004Return the size of the string in memory, in bytes.
14005[clinic start generated code]*/
Eric Smith8c663262007-08-25 02:26:07 +000014006
14007static PyObject *
INADA Naoki3ae20562017-01-16 20:41:20 +090014008unicode_sizeof_impl(PyObject *self)
14009/*[clinic end generated code: output=6dbc2f5a408b6d4f input=6dd011c108e33fb0]*/
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014010{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014011 Py_ssize_t size;
14012
14013 /* If it's a compact object, account for base structure +
14014 character data. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014015 if (PyUnicode_IS_COMPACT_ASCII(self))
14016 size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(self) + 1;
14017 else if (PyUnicode_IS_COMPACT(self))
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014018 size = sizeof(PyCompactUnicodeObject) +
INADA Naoki3ae20562017-01-16 20:41:20 +090014019 (PyUnicode_GET_LENGTH(self) + 1) * PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014020 else {
14021 /* If it is a two-block object, account for base object, and
14022 for character block if present. */
14023 size = sizeof(PyUnicodeObject);
INADA Naoki3ae20562017-01-16 20:41:20 +090014024 if (_PyUnicode_DATA_ANY(self))
14025 size += (PyUnicode_GET_LENGTH(self) + 1) *
14026 PyUnicode_KIND(self);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014027 }
14028 /* If the wstr pointer is present, account for it unless it is shared
Victor Stinnera3be6132011-10-03 02:16:37 +020014029 with the data pointer. Check if the data is not shared. */
INADA Naoki3ae20562017-01-16 20:41:20 +090014030 if (_PyUnicode_HAS_WSTR_MEMORY(self))
14031 size += (PyUnicode_WSTR_LENGTH(self) + 1) * sizeof(wchar_t);
14032 if (_PyUnicode_HAS_UTF8_MEMORY(self))
14033 size += PyUnicode_UTF8_LENGTH(self) + 1;
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014034
14035 return PyLong_FromSsize_t(size);
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014036}
14037
Georg Brandlc28e1fa2008-06-10 19:20:26 +000014038static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014039unicode_getnewargs(PyObject *v, PyObject *Py_UNUSED(ignored))
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014040{
Victor Stinnerbf6e5602011-12-12 01:53:47 +010014041 PyObject *copy = _PyUnicode_Copy(v);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014042 if (!copy)
14043 return NULL;
14044 return Py_BuildValue("(N)", copy);
Guido van Rossum5d9113d2003-01-29 17:58:45 +000014045}
14046
Guido van Rossumd57fd912000-03-10 22:53:23 +000014047static PyMethodDef unicode_methods[] = {
INADA Naoki3ae20562017-01-16 20:41:20 +090014048 UNICODE_ENCODE_METHODDEF
14049 UNICODE_REPLACE_METHODDEF
14050 UNICODE_SPLIT_METHODDEF
14051 UNICODE_RSPLIT_METHODDEF
14052 UNICODE_JOIN_METHODDEF
14053 UNICODE_CAPITALIZE_METHODDEF
14054 UNICODE_CASEFOLD_METHODDEF
14055 UNICODE_TITLE_METHODDEF
14056 UNICODE_CENTER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014057 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014058 UNICODE_EXPANDTABS_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014059 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014060 UNICODE_PARTITION_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014061 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014062 UNICODE_LJUST_METHODDEF
14063 UNICODE_LOWER_METHODDEF
14064 UNICODE_LSTRIP_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014065 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
14066 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014067 UNICODE_RJUST_METHODDEF
14068 UNICODE_RSTRIP_METHODDEF
14069 UNICODE_RPARTITION_METHODDEF
14070 UNICODE_SPLITLINES_METHODDEF
14071 UNICODE_STRIP_METHODDEF
14072 UNICODE_SWAPCASE_METHODDEF
14073 UNICODE_TRANSLATE_METHODDEF
14074 UNICODE_UPPER_METHODDEF
Martin v. Löwise3eb1f22001-08-16 13:15:00 +000014075 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
14076 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
INADA Naokia49ac992018-01-27 14:06:21 +090014077 UNICODE_ISASCII_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014078 UNICODE_ISLOWER_METHODDEF
14079 UNICODE_ISUPPER_METHODDEF
14080 UNICODE_ISTITLE_METHODDEF
14081 UNICODE_ISSPACE_METHODDEF
14082 UNICODE_ISDECIMAL_METHODDEF
14083 UNICODE_ISDIGIT_METHODDEF
14084 UNICODE_ISNUMERIC_METHODDEF
14085 UNICODE_ISALPHA_METHODDEF
14086 UNICODE_ISALNUM_METHODDEF
14087 UNICODE_ISIDENTIFIER_METHODDEF
14088 UNICODE_ISPRINTABLE_METHODDEF
14089 UNICODE_ZFILL_METHODDEF
Serhiy Storchaka62be7422018-11-27 13:27:31 +020014090 {"format", (PyCFunction)(void(*)(void)) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
Eric Smith27bbca62010-11-04 17:06:58 +000014091 {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
INADA Naoki3ae20562017-01-16 20:41:20 +090014092 UNICODE___FORMAT___METHODDEF
Larry Hastings31826802013-10-19 00:09:25 -070014093 UNICODE_MAKETRANS_METHODDEF
INADA Naoki3ae20562017-01-16 20:41:20 +090014094 UNICODE_SIZEOF_METHODDEF
Walter Dörwald068325e2002-04-15 13:36:47 +000014095#if 0
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014096 /* These methods are just used for debugging the implementation. */
Alexander Belopolsky942af5a2010-12-04 03:38:46 +000014097 {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014098#endif
14099
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053014100 {"__getnewargs__", unicode_getnewargs, METH_NOARGS},
Guido van Rossumd57fd912000-03-10 22:53:23 +000014101 {NULL, NULL}
14102};
14103
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014104static PyObject *
14105unicode_mod(PyObject *v, PyObject *w)
14106{
Brian Curtindfc80e32011-08-10 20:28:54 -050014107 if (!PyUnicode_Check(v))
14108 Py_RETURN_NOTIMPLEMENTED;
Benjamin Peterson29060642009-01-31 22:14:21 +000014109 return PyUnicode_Format(v, w);
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014110}
14111
14112static PyNumberMethods unicode_as_number = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014113 0, /*nb_add*/
14114 0, /*nb_subtract*/
14115 0, /*nb_multiply*/
14116 unicode_mod, /*nb_remainder*/
Neil Schemenauerce30bc92002-11-18 16:10:18 +000014117};
14118
Guido van Rossumd57fd912000-03-10 22:53:23 +000014119static PySequenceMethods unicode_as_sequence = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014120 (lenfunc) unicode_length, /* sq_length */
14121 PyUnicode_Concat, /* sq_concat */
14122 (ssizeargfunc) unicode_repeat, /* sq_repeat */
14123 (ssizeargfunc) unicode_getitem, /* sq_item */
14124 0, /* sq_slice */
14125 0, /* sq_ass_item */
14126 0, /* sq_ass_slice */
14127 PyUnicode_Contains, /* sq_contains */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014128};
14129
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014130static PyObject*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014131unicode_subscript(PyObject* self, PyObject* item)
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014132{
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014133 if (PyUnicode_READY(self) == -1)
14134 return NULL;
14135
Victor Stinnera15e2602020-04-08 02:01:56 +020014136 if (_PyIndex_Check(item)) {
Thomas Wouters00ee7ba2006-08-21 19:07:27 +000014137 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014138 if (i == -1 && PyErr_Occurred())
14139 return NULL;
14140 if (i < 0)
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014141 i += PyUnicode_GET_LENGTH(self);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014142 return unicode_getitem(self, i);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014143 } else if (PySlice_Check(item)) {
Zackery Spytz14514d92019-05-17 01:13:03 -060014144 Py_ssize_t start, stop, step, slicelength, i;
14145 size_t cur;
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014146 PyObject *result;
14147 void *src_data, *dest_data;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014148 int src_kind, dest_kind;
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014149 Py_UCS4 ch, max_char, kind_limit;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014150
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014151 if (PySlice_Unpack(item, &start, &stop, &step) < 0) {
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014152 return NULL;
14153 }
Serhiy Storchakab879fe82017-04-08 09:53:51 +030014154 slicelength = PySlice_AdjustIndices(PyUnicode_GET_LENGTH(self),
14155 &start, &stop, step);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014156
14157 if (slicelength <= 0) {
Serhiy Storchaka678db842013-01-26 12:16:36 +020014158 _Py_RETURN_UNICODE_EMPTY();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014159 } else if (start == 0 && step == 1 &&
Victor Stinnerc4b49542011-12-11 22:44:26 +010014160 slicelength == PyUnicode_GET_LENGTH(self)) {
14161 return unicode_result_unchanged(self);
Thomas Woutersed03b412007-08-28 21:37:11 +000014162 } else if (step == 1) {
Victor Stinner9db1a8b2011-10-23 20:04:37 +020014163 return PyUnicode_Substring(self,
Victor Stinner12bab6d2011-10-01 01:53:49 +020014164 start, start + slicelength);
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014165 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014166 /* General case */
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014167 src_kind = PyUnicode_KIND(self);
14168 src_data = PyUnicode_DATA(self);
Victor Stinner55c99112011-10-13 01:17:06 +020014169 if (!PyUnicode_IS_ASCII(self)) {
14170 kind_limit = kind_maxchar_limit(src_kind);
14171 max_char = 0;
14172 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
14173 ch = PyUnicode_READ(src_kind, src_data, cur);
14174 if (ch > max_char) {
14175 max_char = ch;
14176 if (max_char >= kind_limit)
14177 break;
14178 }
Victor Stinnerc80d6d22011-10-05 14:13:28 +020014179 }
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014180 }
Victor Stinner55c99112011-10-13 01:17:06 +020014181 else
14182 max_char = 127;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014183 result = PyUnicode_New(slicelength, max_char);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014184 if (result == NULL)
14185 return NULL;
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014186 dest_kind = PyUnicode_KIND(result);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014187 dest_data = PyUnicode_DATA(result);
14188
14189 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
Antoine Pitrou875f29b2011-10-04 20:00:49 +020014190 Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
14191 PyUnicode_WRITE(dest_kind, dest_data, i, ch);
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014192 }
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020014193 assert(_PyUnicode_CheckConsistency(result, 1));
Antoine Pitrou7aec4012011-10-04 19:08:01 +020014194 return result;
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014195 } else {
14196 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
14197 return NULL;
14198 }
14199}
14200
14201static PyMappingMethods unicode_as_mapping = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000014202 (lenfunc)unicode_length, /* mp_length */
14203 (binaryfunc)unicode_subscript, /* mp_subscript */
14204 (objobjargproc)0, /* mp_ass_subscript */
Michael W. Hudson5efaf7e2002-06-11 10:55:12 +000014205};
14206
Guido van Rossumd57fd912000-03-10 22:53:23 +000014207
Guido van Rossumd57fd912000-03-10 22:53:23 +000014208/* Helpers for PyUnicode_Format() */
14209
Victor Stinnera47082312012-10-04 02:19:54 +020014210struct unicode_formatter_t {
14211 PyObject *args;
14212 int args_owned;
14213 Py_ssize_t arglen, argidx;
14214 PyObject *dict;
14215
14216 enum PyUnicode_Kind fmtkind;
14217 Py_ssize_t fmtcnt, fmtpos;
14218 void *fmtdata;
14219 PyObject *fmtstr;
14220
14221 _PyUnicodeWriter writer;
14222};
14223
14224struct unicode_format_arg_t {
14225 Py_UCS4 ch;
14226 int flags;
14227 Py_ssize_t width;
14228 int prec;
14229 int sign;
14230};
14231
Guido van Rossumd57fd912000-03-10 22:53:23 +000014232static PyObject *
Victor Stinnera47082312012-10-04 02:19:54 +020014233unicode_format_getnextarg(struct unicode_formatter_t *ctx)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014234{
Victor Stinnera47082312012-10-04 02:19:54 +020014235 Py_ssize_t argidx = ctx->argidx;
14236
14237 if (argidx < ctx->arglen) {
14238 ctx->argidx++;
14239 if (ctx->arglen < 0)
14240 return ctx->args;
Benjamin Peterson29060642009-01-31 22:14:21 +000014241 else
Victor Stinnera47082312012-10-04 02:19:54 +020014242 return PyTuple_GetItem(ctx->args, argidx);
Guido van Rossumd57fd912000-03-10 22:53:23 +000014243 }
14244 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014245 "not enough arguments for format string");
Guido van Rossumd57fd912000-03-10 22:53:23 +000014246 return NULL;
14247}
14248
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014249/* Returns a new reference to a PyUnicode object, or NULL on failure. */
Guido van Rossumd57fd912000-03-10 22:53:23 +000014250
Victor Stinnera47082312012-10-04 02:19:54 +020014251/* Format a float into the writer if the writer is not NULL, or into *p_output
14252 otherwise.
14253
14254 Return 0 on success, raise an exception and return -1 on error. */
Victor Stinnerd3f08822012-05-29 12:57:52 +020014255static int
Victor Stinnera47082312012-10-04 02:19:54 +020014256formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
14257 PyObject **p_output,
14258 _PyUnicodeWriter *writer)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014259{
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014260 char *p;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014261 double x;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014262 Py_ssize_t len;
Victor Stinnera47082312012-10-04 02:19:54 +020014263 int prec;
14264 int dtoa_flags;
Tim Petersced69f82003-09-16 20:30:58 +000014265
Guido van Rossumd57fd912000-03-10 22:53:23 +000014266 x = PyFloat_AsDouble(v);
14267 if (x == -1.0 && PyErr_Occurred())
Victor Stinnerd3f08822012-05-29 12:57:52 +020014268 return -1;
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014269
Victor Stinnera47082312012-10-04 02:19:54 +020014270 prec = arg->prec;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014271 if (prec < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000014272 prec = 6;
Eric Smith0923d1d2009-04-16 20:16:10 +000014273
Victor Stinnera47082312012-10-04 02:19:54 +020014274 if (arg->flags & F_ALT)
14275 dtoa_flags = Py_DTSF_ALT;
14276 else
14277 dtoa_flags = 0;
14278 p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
Mark Dickinsonf489caf2009-05-01 11:42:00 +000014279 if (p == NULL)
Victor Stinnerd3f08822012-05-29 12:57:52 +020014280 return -1;
14281 len = strlen(p);
14282 if (writer) {
Victor Stinner4a587072013-11-19 12:54:53 +010014283 if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
Christian Heimesf4f99392012-09-10 11:48:41 +020014284 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014285 return -1;
Christian Heimesf4f99392012-09-10 11:48:41 +020014286 }
Victor Stinnerd3f08822012-05-29 12:57:52 +020014287 }
14288 else
14289 *p_output = _PyUnicode_FromASCII(p, len);
Eric Smith0923d1d2009-04-16 20:16:10 +000014290 PyMem_Free(p);
Victor Stinnerd3f08822012-05-29 12:57:52 +020014291 return 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014292}
14293
Victor Stinnerd0880d52012-04-27 23:40:13 +020014294/* formatlong() emulates the format codes d, u, o, x and X, and
14295 * the F_ALT flag, for Python's long (unbounded) ints. It's not used for
14296 * Python's regular ints.
14297 * Return value: a new PyUnicodeObject*, or NULL if error.
14298 * The output string is of the form
14299 * "-"? ("0x" | "0X")? digit+
14300 * "0x"/"0X" are present only for x and X conversions, with F_ALT
14301 * set in flags. The case of hex digits will be correct,
14302 * There will be at least prec digits, zero-filled on the left if
14303 * necessary to get that many.
14304 * val object to be converted
14305 * flags bitmask of format flags; only F_ALT is looked at
14306 * prec minimum number of digits; 0-fill on left if needed
14307 * type a character in [duoxX]; u acts the same as d
14308 *
14309 * CAUTION: o, x and X conversions on regular ints can never
14310 * produce a '-' sign, but can for Python's unbounded ints.
14311 */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014312PyObject *
14313_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
Tim Peters38fd5b62000-09-21 05:43:11 +000014314{
Victor Stinnerd0880d52012-04-27 23:40:13 +020014315 PyObject *result = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014316 char *buf;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014317 Py_ssize_t i;
14318 int sign; /* 1 if '-', else 0 */
14319 int len; /* number of characters */
14320 Py_ssize_t llen;
14321 int numdigits; /* len == numnondigits + numdigits */
14322 int numnondigits = 0;
Tim Peters38fd5b62000-09-21 05:43:11 +000014323
Victor Stinnerd0880d52012-04-27 23:40:13 +020014324 /* Avoid exceeding SSIZE_T_MAX */
14325 if (prec > INT_MAX-3) {
14326 PyErr_SetString(PyExc_OverflowError,
14327 "precision too large");
Benjamin Peterson14339b62009-01-31 16:36:08 +000014328 return NULL;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014329 }
14330
14331 assert(PyLong_Check(val));
14332
14333 switch (type) {
Victor Stinner621ef3d2012-10-02 00:33:47 +020014334 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014335 Py_UNREACHABLE();
Victor Stinnerd0880d52012-04-27 23:40:13 +020014336 case 'd':
Victor Stinner621ef3d2012-10-02 00:33:47 +020014337 case 'i':
Victor Stinnerd0880d52012-04-27 23:40:13 +020014338 case 'u':
Ethan Furmanfb137212013-08-31 10:18:55 -070014339 /* int and int subclasses should print numerically when a numeric */
14340 /* format code is used (see issue18780) */
14341 result = PyNumber_ToBase(val, 10);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014342 break;
14343 case 'o':
14344 numnondigits = 2;
14345 result = PyNumber_ToBase(val, 8);
14346 break;
14347 case 'x':
14348 case 'X':
14349 numnondigits = 2;
14350 result = PyNumber_ToBase(val, 16);
14351 break;
Victor Stinnerd0880d52012-04-27 23:40:13 +020014352 }
14353 if (!result)
14354 return NULL;
14355
14356 assert(unicode_modifiable(result));
14357 assert(PyUnicode_IS_READY(result));
14358 assert(PyUnicode_IS_ASCII(result));
14359
14360 /* To modify the string in-place, there can only be one reference. */
14361 if (Py_REFCNT(result) != 1) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014362 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014363 PyErr_BadInternalCall();
14364 return NULL;
14365 }
14366 buf = PyUnicode_DATA(result);
14367 llen = PyUnicode_GET_LENGTH(result);
14368 if (llen > INT_MAX) {
Christian Heimesd47802e2013-06-29 21:33:36 +020014369 Py_DECREF(result);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014370 PyErr_SetString(PyExc_ValueError,
Ethan Furmanb95b5612015-01-23 20:05:18 -080014371 "string too large in _PyUnicode_FormatLong");
Victor Stinnerd0880d52012-04-27 23:40:13 +020014372 return NULL;
14373 }
14374 len = (int)llen;
14375 sign = buf[0] == '-';
14376 numnondigits += sign;
14377 numdigits = len - numnondigits;
14378 assert(numdigits > 0);
14379
14380 /* Get rid of base marker unless F_ALT */
Ethan Furmanb95b5612015-01-23 20:05:18 -080014381 if (((alt) == 0 &&
Victor Stinnerd0880d52012-04-27 23:40:13 +020014382 (type == 'o' || type == 'x' || type == 'X'))) {
14383 assert(buf[sign] == '0');
14384 assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14385 buf[sign+1] == 'o');
14386 numnondigits -= 2;
14387 buf += 2;
14388 len -= 2;
14389 if (sign)
14390 buf[0] = '-';
14391 assert(len == numnondigits + numdigits);
14392 assert(numdigits > 0);
14393 }
14394
14395 /* Fill with leading zeroes to meet minimum width. */
14396 if (prec > numdigits) {
14397 PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14398 numnondigits + prec);
14399 char *b1;
14400 if (!r1) {
14401 Py_DECREF(result);
14402 return NULL;
14403 }
14404 b1 = PyBytes_AS_STRING(r1);
14405 for (i = 0; i < numnondigits; ++i)
14406 *b1++ = *buf++;
14407 for (i = 0; i < prec - numdigits; i++)
14408 *b1++ = '0';
14409 for (i = 0; i < numdigits; i++)
14410 *b1++ = *buf++;
14411 *b1 = '\0';
14412 Py_DECREF(result);
14413 result = r1;
14414 buf = PyBytes_AS_STRING(result);
14415 len = numnondigits + prec;
14416 }
14417
14418 /* Fix up case for hex conversions. */
14419 if (type == 'X') {
14420 /* Need to convert all lower case letters to upper case.
14421 and need to convert 0x to 0X (and -0x to -0X). */
14422 for (i = 0; i < len; i++)
14423 if (buf[i] >= 'a' && buf[i] <= 'x')
14424 buf[i] -= 'a'-'A';
14425 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014426 if (!PyUnicode_Check(result)
14427 || buf != PyUnicode_DATA(result)) {
Victor Stinnerd0880d52012-04-27 23:40:13 +020014428 PyObject *unicode;
Victor Stinnerd3f08822012-05-29 12:57:52 +020014429 unicode = _PyUnicode_FromASCII(buf, len);
Victor Stinnerd0880d52012-04-27 23:40:13 +020014430 Py_DECREF(result);
14431 result = unicode;
14432 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014433 else if (len != PyUnicode_GET_LENGTH(result)) {
14434 if (PyUnicode_Resize(&result, len) < 0)
14435 Py_CLEAR(result);
14436 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000014437 return result;
Tim Peters38fd5b62000-09-21 05:43:11 +000014438}
14439
Ethan Furmandf3ed242014-01-05 06:50:30 -080014440/* Format an integer or a float as an integer.
Victor Stinner621ef3d2012-10-02 00:33:47 +020014441 * Return 1 if the number has been formatted into the writer,
Victor Stinnera47082312012-10-04 02:19:54 +020014442 * 0 if the number has been formatted into *p_output
Victor Stinner621ef3d2012-10-02 00:33:47 +020014443 * -1 and raise an exception on error */
14444static int
Victor Stinnera47082312012-10-04 02:19:54 +020014445mainformatlong(PyObject *v,
14446 struct unicode_format_arg_t *arg,
14447 PyObject **p_output,
14448 _PyUnicodeWriter *writer)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014449{
14450 PyObject *iobj, *res;
Victor Stinnera47082312012-10-04 02:19:54 +020014451 char type = (char)arg->ch;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014452
14453 if (!PyNumber_Check(v))
14454 goto wrongtype;
14455
Ethan Furman9ab74802014-03-21 06:38:46 -070014456 /* make sure number is a type of integer for o, x, and X */
Victor Stinner621ef3d2012-10-02 00:33:47 +020014457 if (!PyLong_Check(v)) {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014458 if (type == 'o' || type == 'x' || type == 'X') {
14459 iobj = PyNumber_Index(v);
14460 if (iobj == NULL) {
Ethan Furman9ab74802014-03-21 06:38:46 -070014461 if (PyErr_ExceptionMatches(PyExc_TypeError))
14462 goto wrongtype;
Ethan Furman38d872e2014-03-19 08:38:52 -070014463 return -1;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014464 }
14465 }
14466 else {
14467 iobj = PyNumber_Long(v);
14468 if (iobj == NULL ) {
14469 if (PyErr_ExceptionMatches(PyExc_TypeError))
14470 goto wrongtype;
14471 return -1;
14472 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014473 }
14474 assert(PyLong_Check(iobj));
14475 }
14476 else {
14477 iobj = v;
14478 Py_INCREF(iobj);
14479 }
14480
14481 if (PyLong_CheckExact(v)
Victor Stinnera47082312012-10-04 02:19:54 +020014482 && arg->width == -1 && arg->prec == -1
14483 && !(arg->flags & (F_SIGN | F_BLANK))
14484 && type != 'X')
Victor Stinner621ef3d2012-10-02 00:33:47 +020014485 {
14486 /* Fast path */
Victor Stinnera47082312012-10-04 02:19:54 +020014487 int alternate = arg->flags & F_ALT;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014488 int base;
14489
Victor Stinnera47082312012-10-04 02:19:54 +020014490 switch(type)
Victor Stinner621ef3d2012-10-02 00:33:47 +020014491 {
14492 default:
Barry Warsawb2e57942017-09-14 18:13:16 -070014493 Py_UNREACHABLE();
Victor Stinner621ef3d2012-10-02 00:33:47 +020014494 case 'd':
14495 case 'i':
14496 case 'u':
14497 base = 10;
14498 break;
14499 case 'o':
14500 base = 8;
14501 break;
14502 case 'x':
14503 case 'X':
14504 base = 16;
14505 break;
14506 }
14507
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014508 if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14509 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014510 return -1;
Victor Stinnerc89d28f2012-10-02 12:54:07 +020014511 }
14512 Py_DECREF(iobj);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014513 return 1;
14514 }
14515
Ethan Furmanb95b5612015-01-23 20:05:18 -080014516 res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
Victor Stinner621ef3d2012-10-02 00:33:47 +020014517 Py_DECREF(iobj);
14518 if (res == NULL)
14519 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014520 *p_output = res;
Victor Stinner621ef3d2012-10-02 00:33:47 +020014521 return 0;
14522
14523wrongtype:
Ethan Furman9ab74802014-03-21 06:38:46 -070014524 switch(type)
14525 {
14526 case 'o':
14527 case 'x':
14528 case 'X':
14529 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014530 "%%%c format: an integer is required, "
14531 "not %.200s",
14532 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014533 break;
14534 default:
14535 PyErr_Format(PyExc_TypeError,
Victor Stinner998b8062018-09-12 00:23:25 +020014536 "%%%c format: a number is required, "
14537 "not %.200s",
14538 type, Py_TYPE(v)->tp_name);
Ethan Furman9ab74802014-03-21 06:38:46 -070014539 break;
14540 }
Victor Stinner621ef3d2012-10-02 00:33:47 +020014541 return -1;
14542}
14543
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014544static Py_UCS4
14545formatchar(PyObject *v)
Guido van Rossumd57fd912000-03-10 22:53:23 +000014546{
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014547 /* presume that the buffer is at least 3 characters long */
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014548 if (PyUnicode_Check(v)) {
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020014549 if (PyUnicode_GET_LENGTH(v) == 1) {
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014550 return PyUnicode_READ_CHAR(v, 0);
Benjamin Peterson29060642009-01-31 22:14:21 +000014551 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014552 goto onError;
14553 }
14554 else {
Ethan Furmandf3ed242014-01-05 06:50:30 -080014555 PyObject *iobj;
Benjamin Peterson29060642009-01-31 22:14:21 +000014556 long x;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014557 /* make sure number is a type of integer */
14558 if (!PyLong_Check(v)) {
14559 iobj = PyNumber_Index(v);
14560 if (iobj == NULL) {
Ethan Furman38d872e2014-03-19 08:38:52 -070014561 goto onError;
Ethan Furmandf3ed242014-01-05 06:50:30 -080014562 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014563 x = PyLong_AsLong(iobj);
Ethan Furmandf3ed242014-01-05 06:50:30 -080014564 Py_DECREF(iobj);
14565 }
Xiang Zhangea1cf872016-12-22 15:30:47 +080014566 else {
14567 x = PyLong_AsLong(v);
14568 }
Benjamin Peterson29060642009-01-31 22:14:21 +000014569 if (x == -1 && PyErr_Occurred())
14570 goto onError;
14571
Victor Stinner8faf8212011-12-08 22:14:11 +010014572 if (x < 0 || x > MAX_UNICODE) {
Benjamin Peterson29060642009-01-31 22:14:21 +000014573 PyErr_SetString(PyExc_OverflowError,
14574 "%c arg not in range(0x110000)");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014575 return (Py_UCS4) -1;
Benjamin Peterson29060642009-01-31 22:14:21 +000014576 }
14577
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014578 return (Py_UCS4) x;
Benjamin Peterson14339b62009-01-31 16:36:08 +000014579 }
Amaury Forgeot d'Arca4db6862008-07-04 21:26:43 +000014580
Benjamin Peterson29060642009-01-31 22:14:21 +000014581 onError:
Marc-André Lemburgd4ab4a52000-06-08 17:54:00 +000014582 PyErr_SetString(PyExc_TypeError,
Benjamin Peterson29060642009-01-31 22:14:21 +000014583 "%c requires int or char");
Antoine Pitrou5c0ba362011-10-07 01:54:09 +020014584 return (Py_UCS4) -1;
Guido van Rossumd57fd912000-03-10 22:53:23 +000014585}
14586
Victor Stinnera47082312012-10-04 02:19:54 +020014587/* Parse options of an argument: flags, width, precision.
14588 Handle also "%(name)" syntax.
14589
14590 Return 0 if the argument has been formatted into arg->str.
14591 Return 1 if the argument has been written into ctx->writer,
14592 Raise an exception and return -1 on error. */
14593static int
14594unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14595 struct unicode_format_arg_t *arg)
14596{
14597#define FORMAT_READ(ctx) \
14598 PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14599
14600 PyObject *v;
14601
Victor Stinnera47082312012-10-04 02:19:54 +020014602 if (arg->ch == '(') {
14603 /* Get argument value from a dictionary. Example: "%(name)s". */
14604 Py_ssize_t keystart;
14605 Py_ssize_t keylen;
14606 PyObject *key;
14607 int pcount = 1;
14608
14609 if (ctx->dict == NULL) {
14610 PyErr_SetString(PyExc_TypeError,
14611 "format requires a mapping");
14612 return -1;
14613 }
14614 ++ctx->fmtpos;
14615 --ctx->fmtcnt;
14616 keystart = ctx->fmtpos;
14617 /* Skip over balanced parentheses */
14618 while (pcount > 0 && --ctx->fmtcnt >= 0) {
14619 arg->ch = FORMAT_READ(ctx);
14620 if (arg->ch == ')')
14621 --pcount;
14622 else if (arg->ch == '(')
14623 ++pcount;
14624 ctx->fmtpos++;
14625 }
14626 keylen = ctx->fmtpos - keystart - 1;
14627 if (ctx->fmtcnt < 0 || pcount > 0) {
14628 PyErr_SetString(PyExc_ValueError,
14629 "incomplete format key");
14630 return -1;
14631 }
14632 key = PyUnicode_Substring(ctx->fmtstr,
14633 keystart, keystart + keylen);
14634 if (key == NULL)
14635 return -1;
14636 if (ctx->args_owned) {
Victor Stinnera47082312012-10-04 02:19:54 +020014637 ctx->args_owned = 0;
Serhiy Storchaka191321d2015-12-27 15:41:34 +020014638 Py_DECREF(ctx->args);
Victor Stinnera47082312012-10-04 02:19:54 +020014639 }
14640 ctx->args = PyObject_GetItem(ctx->dict, key);
14641 Py_DECREF(key);
14642 if (ctx->args == NULL)
14643 return -1;
14644 ctx->args_owned = 1;
14645 ctx->arglen = -1;
14646 ctx->argidx = -2;
14647 }
14648
14649 /* Parse flags. Example: "%+i" => flags=F_SIGN. */
Victor Stinnera47082312012-10-04 02:19:54 +020014650 while (--ctx->fmtcnt >= 0) {
14651 arg->ch = FORMAT_READ(ctx);
14652 ctx->fmtpos++;
14653 switch (arg->ch) {
14654 case '-': arg->flags |= F_LJUST; continue;
14655 case '+': arg->flags |= F_SIGN; continue;
14656 case ' ': arg->flags |= F_BLANK; continue;
14657 case '#': arg->flags |= F_ALT; continue;
14658 case '0': arg->flags |= F_ZERO; continue;
14659 }
14660 break;
14661 }
14662
14663 /* Parse width. Example: "%10s" => width=10 */
Victor Stinnera47082312012-10-04 02:19:54 +020014664 if (arg->ch == '*') {
14665 v = unicode_format_getnextarg(ctx);
14666 if (v == NULL)
14667 return -1;
14668 if (!PyLong_Check(v)) {
14669 PyErr_SetString(PyExc_TypeError,
14670 "* wants int");
14671 return -1;
14672 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014673 arg->width = PyLong_AsSsize_t(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014674 if (arg->width == -1 && PyErr_Occurred())
14675 return -1;
14676 if (arg->width < 0) {
14677 arg->flags |= F_LJUST;
14678 arg->width = -arg->width;
14679 }
14680 if (--ctx->fmtcnt >= 0) {
14681 arg->ch = FORMAT_READ(ctx);
14682 ctx->fmtpos++;
14683 }
14684 }
14685 else if (arg->ch >= '0' && arg->ch <= '9') {
14686 arg->width = arg->ch - '0';
14687 while (--ctx->fmtcnt >= 0) {
14688 arg->ch = FORMAT_READ(ctx);
14689 ctx->fmtpos++;
14690 if (arg->ch < '0' || arg->ch > '9')
14691 break;
14692 /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14693 mixing signed and unsigned comparison. Since arg->ch is between
14694 '0' and '9', casting to int is safe. */
14695 if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14696 PyErr_SetString(PyExc_ValueError,
14697 "width too big");
14698 return -1;
14699 }
14700 arg->width = arg->width*10 + (arg->ch - '0');
14701 }
14702 }
14703
14704 /* Parse precision. Example: "%.3f" => prec=3 */
Victor Stinnera47082312012-10-04 02:19:54 +020014705 if (arg->ch == '.') {
14706 arg->prec = 0;
14707 if (--ctx->fmtcnt >= 0) {
14708 arg->ch = FORMAT_READ(ctx);
14709 ctx->fmtpos++;
14710 }
14711 if (arg->ch == '*') {
14712 v = unicode_format_getnextarg(ctx);
14713 if (v == NULL)
14714 return -1;
14715 if (!PyLong_Check(v)) {
14716 PyErr_SetString(PyExc_TypeError,
14717 "* wants int");
14718 return -1;
14719 }
Serhiy Storchaka78980432013-01-15 01:12:17 +020014720 arg->prec = _PyLong_AsInt(v);
Victor Stinnera47082312012-10-04 02:19:54 +020014721 if (arg->prec == -1 && PyErr_Occurred())
14722 return -1;
14723 if (arg->prec < 0)
14724 arg->prec = 0;
14725 if (--ctx->fmtcnt >= 0) {
14726 arg->ch = FORMAT_READ(ctx);
14727 ctx->fmtpos++;
14728 }
14729 }
14730 else if (arg->ch >= '0' && arg->ch <= '9') {
14731 arg->prec = arg->ch - '0';
14732 while (--ctx->fmtcnt >= 0) {
14733 arg->ch = FORMAT_READ(ctx);
14734 ctx->fmtpos++;
14735 if (arg->ch < '0' || arg->ch > '9')
14736 break;
14737 if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14738 PyErr_SetString(PyExc_ValueError,
Victor Stinner3921e902012-10-06 23:05:00 +020014739 "precision too big");
Victor Stinnera47082312012-10-04 02:19:54 +020014740 return -1;
14741 }
14742 arg->prec = arg->prec*10 + (arg->ch - '0');
14743 }
14744 }
14745 }
14746
14747 /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14748 if (ctx->fmtcnt >= 0) {
14749 if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14750 if (--ctx->fmtcnt >= 0) {
14751 arg->ch = FORMAT_READ(ctx);
14752 ctx->fmtpos++;
14753 }
14754 }
14755 }
14756 if (ctx->fmtcnt < 0) {
14757 PyErr_SetString(PyExc_ValueError,
14758 "incomplete format");
14759 return -1;
14760 }
14761 return 0;
14762
14763#undef FORMAT_READ
14764}
14765
14766/* Format one argument. Supported conversion specifiers:
14767
14768 - "s", "r", "a": any type
Ethan Furmandf3ed242014-01-05 06:50:30 -080014769 - "i", "d", "u": int or float
14770 - "o", "x", "X": int
Victor Stinnera47082312012-10-04 02:19:54 +020014771 - "e", "E", "f", "F", "g", "G": float
14772 - "c": int or str (1 character)
14773
Victor Stinner8dbd4212012-12-04 09:30:24 +010014774 When possible, the output is written directly into the Unicode writer
14775 (ctx->writer). A string is created when padding is required.
14776
Victor Stinnera47082312012-10-04 02:19:54 +020014777 Return 0 if the argument has been formatted into *p_str,
14778 1 if the argument has been written into ctx->writer,
Victor Stinner8dbd4212012-12-04 09:30:24 +010014779 -1 on error. */
Victor Stinnera47082312012-10-04 02:19:54 +020014780static int
14781unicode_format_arg_format(struct unicode_formatter_t *ctx,
14782 struct unicode_format_arg_t *arg,
14783 PyObject **p_str)
14784{
14785 PyObject *v;
14786 _PyUnicodeWriter *writer = &ctx->writer;
14787
14788 if (ctx->fmtcnt == 0)
14789 ctx->writer.overallocate = 0;
14790
Victor Stinnera47082312012-10-04 02:19:54 +020014791 v = unicode_format_getnextarg(ctx);
14792 if (v == NULL)
14793 return -1;
14794
Victor Stinnera47082312012-10-04 02:19:54 +020014795
14796 switch (arg->ch) {
Victor Stinnera47082312012-10-04 02:19:54 +020014797 case 's':
14798 case 'r':
14799 case 'a':
14800 if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14801 /* Fast path */
14802 if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14803 return -1;
14804 return 1;
14805 }
14806
14807 if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14808 *p_str = v;
14809 Py_INCREF(*p_str);
14810 }
14811 else {
14812 if (arg->ch == 's')
14813 *p_str = PyObject_Str(v);
14814 else if (arg->ch == 'r')
14815 *p_str = PyObject_Repr(v);
14816 else
14817 *p_str = PyObject_ASCII(v);
14818 }
14819 break;
14820
14821 case 'i':
14822 case 'd':
14823 case 'u':
14824 case 'o':
14825 case 'x':
14826 case 'X':
14827 {
14828 int ret = mainformatlong(v, arg, p_str, writer);
14829 if (ret != 0)
14830 return ret;
14831 arg->sign = 1;
14832 break;
14833 }
14834
14835 case 'e':
14836 case 'E':
14837 case 'f':
14838 case 'F':
14839 case 'g':
14840 case 'G':
14841 if (arg->width == -1 && arg->prec == -1
14842 && !(arg->flags & (F_SIGN | F_BLANK)))
14843 {
14844 /* Fast path */
14845 if (formatfloat(v, arg, NULL, writer) == -1)
14846 return -1;
14847 return 1;
14848 }
14849
14850 arg->sign = 1;
14851 if (formatfloat(v, arg, p_str, NULL) == -1)
14852 return -1;
14853 break;
14854
14855 case 'c':
14856 {
14857 Py_UCS4 ch = formatchar(v);
14858 if (ch == (Py_UCS4) -1)
14859 return -1;
14860 if (arg->width == -1 && arg->prec == -1) {
14861 /* Fast path */
Victor Stinner8a1a6cf2013-04-14 02:35:33 +020014862 if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
Victor Stinnera47082312012-10-04 02:19:54 +020014863 return -1;
Victor Stinnera47082312012-10-04 02:19:54 +020014864 return 1;
14865 }
14866 *p_str = PyUnicode_FromOrdinal(ch);
14867 break;
14868 }
14869
14870 default:
14871 PyErr_Format(PyExc_ValueError,
14872 "unsupported format character '%c' (0x%x) "
Victor Stinnera33bce02014-07-04 22:47:46 +020014873 "at index %zd",
Victor Stinnera47082312012-10-04 02:19:54 +020014874 (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14875 (int)arg->ch,
14876 ctx->fmtpos - 1);
14877 return -1;
14878 }
14879 if (*p_str == NULL)
14880 return -1;
14881 assert (PyUnicode_Check(*p_str));
14882 return 0;
14883}
14884
14885static int
14886unicode_format_arg_output(struct unicode_formatter_t *ctx,
14887 struct unicode_format_arg_t *arg,
14888 PyObject *str)
14889{
14890 Py_ssize_t len;
14891 enum PyUnicode_Kind kind;
14892 void *pbuf;
14893 Py_ssize_t pindex;
14894 Py_UCS4 signchar;
14895 Py_ssize_t buflen;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014896 Py_UCS4 maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014897 Py_ssize_t sublen;
14898 _PyUnicodeWriter *writer = &ctx->writer;
14899 Py_UCS4 fill;
14900
14901 fill = ' ';
14902 if (arg->sign && arg->flags & F_ZERO)
14903 fill = '0';
14904
14905 if (PyUnicode_READY(str) == -1)
14906 return -1;
14907
14908 len = PyUnicode_GET_LENGTH(str);
14909 if ((arg->width == -1 || arg->width <= len)
14910 && (arg->prec == -1 || arg->prec >= len)
14911 && !(arg->flags & (F_SIGN | F_BLANK)))
14912 {
14913 /* Fast path */
14914 if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14915 return -1;
14916 return 0;
14917 }
14918
14919 /* Truncate the string for "s", "r" and "a" formats
14920 if the precision is set */
14921 if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14922 if (arg->prec >= 0 && len > arg->prec)
14923 len = arg->prec;
14924 }
14925
14926 /* Adjust sign and width */
14927 kind = PyUnicode_KIND(str);
14928 pbuf = PyUnicode_DATA(str);
14929 pindex = 0;
14930 signchar = '\0';
14931 if (arg->sign) {
14932 Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14933 if (ch == '-' || ch == '+') {
14934 signchar = ch;
14935 len--;
14936 pindex++;
14937 }
14938 else if (arg->flags & F_SIGN)
14939 signchar = '+';
14940 else if (arg->flags & F_BLANK)
14941 signchar = ' ';
14942 else
14943 arg->sign = 0;
14944 }
14945 if (arg->width < len)
14946 arg->width = len;
14947
14948 /* Prepare the writer */
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014949 maxchar = writer->maxchar;
Victor Stinnera47082312012-10-04 02:19:54 +020014950 if (!(arg->flags & F_LJUST)) {
14951 if (arg->sign) {
14952 if ((arg->width-1) > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014953 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014954 }
14955 else {
14956 if (arg->width > len)
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014957 maxchar = Py_MAX(maxchar, fill);
Victor Stinnera47082312012-10-04 02:19:54 +020014958 }
14959 }
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014960 if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14961 Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
Benjamin Peterson3164f5d2013-06-10 09:24:01 -070014962 maxchar = Py_MAX(maxchar, strmaxchar);
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014963 }
14964
Victor Stinnera47082312012-10-04 02:19:54 +020014965 buflen = arg->width;
14966 if (arg->sign && len == arg->width)
14967 buflen++;
Victor Stinnereb4b5ac2013-04-03 02:02:33 +020014968 if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
Victor Stinnera47082312012-10-04 02:19:54 +020014969 return -1;
14970
14971 /* Write the sign if needed */
14972 if (arg->sign) {
14973 if (fill != ' ') {
14974 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14975 writer->pos += 1;
14976 }
14977 if (arg->width > len)
14978 arg->width--;
14979 }
14980
14981 /* Write the numeric prefix for "x", "X" and "o" formats
14982 if the alternate form is used.
14983 For example, write "0x" for the "%#x" format. */
14984 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14985 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14986 assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14987 if (fill != ' ') {
14988 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14989 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14990 writer->pos += 2;
14991 pindex += 2;
14992 }
14993 arg->width -= 2;
14994 if (arg->width < 0)
14995 arg->width = 0;
14996 len -= 2;
14997 }
14998
14999 /* Pad left with the fill character if needed */
15000 if (arg->width > len && !(arg->flags & F_LJUST)) {
15001 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015002 unicode_fill(writer->kind, writer->data, fill, writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015003 writer->pos += sublen;
15004 arg->width = len;
15005 }
15006
15007 /* If padding with spaces: write sign if needed and/or numeric prefix if
15008 the alternate form is used */
15009 if (fill == ' ') {
15010 if (arg->sign) {
15011 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
15012 writer->pos += 1;
15013 }
15014 if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
15015 assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
15016 assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
15017 PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
15018 PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
15019 writer->pos += 2;
15020 pindex += 2;
15021 }
15022 }
15023
15024 /* Write characters */
15025 if (len) {
15026 _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
15027 str, pindex, len);
15028 writer->pos += len;
15029 }
15030
15031 /* Pad right with the fill character if needed */
15032 if (arg->width > len) {
15033 sublen = arg->width - len;
Victor Stinner59423e32018-11-26 13:40:01 +010015034 unicode_fill(writer->kind, writer->data, ' ', writer->pos, sublen);
Victor Stinnera47082312012-10-04 02:19:54 +020015035 writer->pos += sublen;
15036 }
15037 return 0;
15038}
15039
15040/* Helper of PyUnicode_Format(): format one arg.
15041 Return 0 on success, raise an exception and return -1 on error. */
15042static int
15043unicode_format_arg(struct unicode_formatter_t *ctx)
15044{
15045 struct unicode_format_arg_t arg;
15046 PyObject *str;
15047 int ret;
15048
Victor Stinner8dbd4212012-12-04 09:30:24 +010015049 arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015050 if (arg.ch == '%') {
15051 ctx->fmtpos++;
15052 ctx->fmtcnt--;
15053 if (_PyUnicodeWriter_WriteCharInline(&ctx->writer, '%') < 0)
15054 return -1;
15055 return 0;
15056 }
Victor Stinner8dbd4212012-12-04 09:30:24 +010015057 arg.flags = 0;
15058 arg.width = -1;
15059 arg.prec = -1;
15060 arg.sign = 0;
15061 str = NULL;
15062
Victor Stinnera47082312012-10-04 02:19:54 +020015063 ret = unicode_format_arg_parse(ctx, &arg);
15064 if (ret == -1)
15065 return -1;
15066
15067 ret = unicode_format_arg_format(ctx, &arg, &str);
15068 if (ret == -1)
15069 return -1;
15070
15071 if (ret != 1) {
15072 ret = unicode_format_arg_output(ctx, &arg, str);
15073 Py_DECREF(str);
15074 if (ret == -1)
15075 return -1;
15076 }
15077
Serhiy Storchaka9f8ad3f2017-03-08 05:51:19 +020015078 if (ctx->dict && (ctx->argidx < ctx->arglen)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015079 PyErr_SetString(PyExc_TypeError,
15080 "not all arguments converted during string formatting");
15081 return -1;
15082 }
15083 return 0;
15084}
15085
Alexander Belopolsky40018472011-02-26 01:02:56 +000015086PyObject *
15087PyUnicode_Format(PyObject *format, PyObject *args)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015088{
Victor Stinnera47082312012-10-04 02:19:54 +020015089 struct unicode_formatter_t ctx;
Tim Petersced69f82003-09-16 20:30:58 +000015090
Guido van Rossumd57fd912000-03-10 22:53:23 +000015091 if (format == NULL || args == NULL) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015092 PyErr_BadInternalCall();
15093 return NULL;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015094 }
Victor Stinnera47082312012-10-04 02:19:54 +020015095
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015096 if (ensure_unicode(format) < 0)
Benjamin Peterson29060642009-01-31 22:14:21 +000015097 return NULL;
Serhiy Storchaka21a663e2016-04-13 15:37:23 +030015098
15099 ctx.fmtstr = format;
Victor Stinnera47082312012-10-04 02:19:54 +020015100 ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
15101 ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
15102 ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
15103 ctx.fmtpos = 0;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015104
Victor Stinner8f674cc2013-04-17 23:02:17 +020015105 _PyUnicodeWriter_Init(&ctx.writer);
15106 ctx.writer.min_length = ctx.fmtcnt + 100;
15107 ctx.writer.overallocate = 1;
Victor Stinnerf2c76aa2012-05-03 13:10:40 +020015108
Guido van Rossumd57fd912000-03-10 22:53:23 +000015109 if (PyTuple_Check(args)) {
Victor Stinnera47082312012-10-04 02:19:54 +020015110 ctx.arglen = PyTuple_Size(args);
15111 ctx.argidx = 0;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015112 }
15113 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015114 ctx.arglen = -1;
15115 ctx.argidx = -2;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015116 }
Victor Stinnera47082312012-10-04 02:19:54 +020015117 ctx.args_owned = 0;
Benjamin Peterson28a6cfa2012-08-28 17:55:35 -040015118 if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
Victor Stinnera47082312012-10-04 02:19:54 +020015119 ctx.dict = args;
15120 else
15121 ctx.dict = NULL;
15122 ctx.args = args;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015123
Victor Stinnera47082312012-10-04 02:19:54 +020015124 while (--ctx.fmtcnt >= 0) {
15125 if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
Victor Stinnercfc4c132013-04-03 01:48:39 +020015126 Py_ssize_t nonfmtpos;
Victor Stinnera47082312012-10-04 02:19:54 +020015127
15128 nonfmtpos = ctx.fmtpos++;
15129 while (ctx.fmtcnt >= 0 &&
15130 PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
15131 ctx.fmtpos++;
15132 ctx.fmtcnt--;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015133 }
Victor Stinnera47082312012-10-04 02:19:54 +020015134 if (ctx.fmtcnt < 0) {
15135 ctx.fmtpos--;
15136 ctx.writer.overallocate = 0;
Victor Stinnera0494432012-10-03 23:03:46 +020015137 }
Victor Stinneree4544c2012-05-09 22:24:08 +020015138
Victor Stinnercfc4c132013-04-03 01:48:39 +020015139 if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
15140 nonfmtpos, ctx.fmtpos) < 0)
15141 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015142 }
15143 else {
Victor Stinnera47082312012-10-04 02:19:54 +020015144 ctx.fmtpos++;
15145 if (unicode_format_arg(&ctx) == -1)
Benjamin Peterson29060642009-01-31 22:14:21 +000015146 goto onError;
Victor Stinnera47082312012-10-04 02:19:54 +020015147 }
15148 }
Victor Stinneraff3cc62012-04-30 05:19:21 +020015149
Victor Stinnera47082312012-10-04 02:19:54 +020015150 if (ctx.argidx < ctx.arglen && !ctx.dict) {
Benjamin Peterson29060642009-01-31 22:14:21 +000015151 PyErr_SetString(PyExc_TypeError,
15152 "not all arguments converted during string formatting");
15153 goto onError;
Guido van Rossumd57fd912000-03-10 22:53:23 +000015154 }
15155
Victor Stinnera47082312012-10-04 02:19:54 +020015156 if (ctx.args_owned) {
15157 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015158 }
Victor Stinnera47082312012-10-04 02:19:54 +020015159 return _PyUnicodeWriter_Finish(&ctx.writer);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015160
Benjamin Peterson29060642009-01-31 22:14:21 +000015161 onError:
Victor Stinnera47082312012-10-04 02:19:54 +020015162 _PyUnicodeWriter_Dealloc(&ctx.writer);
15163 if (ctx.args_owned) {
15164 Py_DECREF(ctx.args);
Guido van Rossumd57fd912000-03-10 22:53:23 +000015165 }
15166 return NULL;
15167}
15168
Jeremy Hylton938ace62002-07-17 16:30:39 +000015169static PyObject *
Guido van Rossume023fe02001-08-30 03:12:59 +000015170unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
15171
Tim Peters6d6c1a32001-08-02 04:15:00 +000015172static PyObject *
15173unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15174{
Benjamin Peterson29060642009-01-31 22:14:21 +000015175 PyObject *x = NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015176 static char *kwlist[] = {"object", "encoding", "errors", 0};
15177 char *encoding = NULL;
15178 char *errors = NULL;
Tim Peters6d6c1a32001-08-02 04:15:00 +000015179
Benjamin Peterson14339b62009-01-31 16:36:08 +000015180 if (type != &PyUnicode_Type)
15181 return unicode_subtype_new(type, args, kwds);
15182 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
Benjamin Peterson29060642009-01-31 22:14:21 +000015183 kwlist, &x, &encoding, &errors))
Benjamin Peterson14339b62009-01-31 16:36:08 +000015184 return NULL;
15185 if (x == NULL)
Serhiy Storchaka678db842013-01-26 12:16:36 +020015186 _Py_RETURN_UNICODE_EMPTY();
Benjamin Peterson14339b62009-01-31 16:36:08 +000015187 if (encoding == NULL && errors == NULL)
15188 return PyObject_Str(x);
15189 else
Benjamin Peterson29060642009-01-31 22:14:21 +000015190 return PyUnicode_FromEncodedObject(x, encoding, errors);
Tim Peters6d6c1a32001-08-02 04:15:00 +000015191}
15192
Guido van Rossume023fe02001-08-30 03:12:59 +000015193static PyObject *
15194unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
15195{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015196 PyObject *unicode, *self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015197 Py_ssize_t length, char_size;
15198 int share_wstr, share_utf8;
15199 unsigned int kind;
15200 void *data;
Guido van Rossume023fe02001-08-30 03:12:59 +000015201
Benjamin Peterson14339b62009-01-31 16:36:08 +000015202 assert(PyType_IsSubtype(type, &PyUnicode_Type));
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015203
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015204 unicode = unicode_new(&PyUnicode_Type, args, kwds);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015205 if (unicode == NULL)
Benjamin Peterson14339b62009-01-31 16:36:08 +000015206 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015207 assert(_PyUnicode_CHECK(unicode));
Benjamin Petersonbac79492012-01-14 13:34:47 -050015208 if (PyUnicode_READY(unicode) == -1) {
Benjamin Peterson22a29702012-01-02 09:00:30 -060015209 Py_DECREF(unicode);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015210 return NULL;
Benjamin Peterson22a29702012-01-02 09:00:30 -060015211 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015212
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015213 self = type->tp_alloc(type, 0);
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015214 if (self == NULL) {
15215 Py_DECREF(unicode);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015216 return NULL;
15217 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015218 kind = PyUnicode_KIND(unicode);
15219 length = PyUnicode_GET_LENGTH(unicode);
15220
15221 _PyUnicode_LENGTH(self) = length;
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015222#ifdef Py_DEBUG
15223 _PyUnicode_HASH(self) = -1;
15224#else
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015225 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015226#endif
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015227 _PyUnicode_STATE(self).interned = 0;
15228 _PyUnicode_STATE(self).kind = kind;
15229 _PyUnicode_STATE(self).compact = 0;
Victor Stinner3cf46372011-10-03 14:42:15 +020015230 _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015231 _PyUnicode_STATE(self).ready = 1;
15232 _PyUnicode_WSTR(self) = NULL;
15233 _PyUnicode_UTF8_LENGTH(self) = 0;
15234 _PyUnicode_UTF8(self) = NULL;
15235 _PyUnicode_WSTR_LENGTH(self) = 0;
Victor Stinnerc3c74152011-10-02 20:39:55 +020015236 _PyUnicode_DATA_ANY(self) = NULL;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015237
15238 share_utf8 = 0;
15239 share_wstr = 0;
15240 if (kind == PyUnicode_1BYTE_KIND) {
15241 char_size = 1;
15242 if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
15243 share_utf8 = 1;
15244 }
15245 else if (kind == PyUnicode_2BYTE_KIND) {
15246 char_size = 2;
15247 if (sizeof(wchar_t) == 2)
15248 share_wstr = 1;
15249 }
15250 else {
15251 assert(kind == PyUnicode_4BYTE_KIND);
15252 char_size = 4;
15253 if (sizeof(wchar_t) == 4)
15254 share_wstr = 1;
15255 }
15256
15257 /* Ensure we won't overflow the length. */
15258 if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
15259 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015260 goto onError;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015261 }
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015262 data = PyObject_MALLOC((length + 1) * char_size);
15263 if (data == NULL) {
15264 PyErr_NoMemory();
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015265 goto onError;
15266 }
15267
Victor Stinnerc3c74152011-10-02 20:39:55 +020015268 _PyUnicode_DATA_ANY(self) = data;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015269 if (share_utf8) {
15270 _PyUnicode_UTF8_LENGTH(self) = length;
15271 _PyUnicode_UTF8(self) = data;
15272 }
15273 if (share_wstr) {
15274 _PyUnicode_WSTR_LENGTH(self) = length;
15275 _PyUnicode_WSTR(self) = (wchar_t *)data;
15276 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015277
Christian Heimesf051e432016-09-13 20:22:02 +020015278 memcpy(data, PyUnicode_DATA(unicode),
Martin v. Löwisc47adb02011-10-07 20:55:35 +020015279 kind * (length + 1));
Victor Stinnerbb10a1f2011-10-05 01:34:17 +020015280 assert(_PyUnicode_CheckConsistency(self, 1));
Victor Stinnerfb9ea8c2011-10-06 01:45:57 +020015281#ifdef Py_DEBUG
15282 _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
15283#endif
Victor Stinnerdd18d3a2011-10-22 11:08:10 +020015284 Py_DECREF(unicode);
Victor Stinner7931d9a2011-11-04 00:22:48 +010015285 return self;
Victor Stinner07ac3eb2011-10-01 16:16:43 +020015286
15287onError:
15288 Py_DECREF(unicode);
15289 Py_DECREF(self);
15290 return NULL;
Guido van Rossume023fe02001-08-30 03:12:59 +000015291}
15292
Martin v. Löwis14f8b4c2002-06-13 20:33:02 +000015293PyDoc_STRVAR(unicode_doc,
Chris Jerdonek83fe2e12012-10-07 14:48:36 -070015294"str(object='') -> str\n\
15295str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
Tim Peters6d6c1a32001-08-02 04:15:00 +000015296\n\
Nick Coghlan573b1fd2012-08-16 14:13:07 +100015297Create a new string object from the given object. If encoding or\n\
15298errors is specified, then the object must expose a data buffer\n\
15299that will be decoded using the given encoding and error handler.\n\
15300Otherwise, returns the result of object.__str__() (if defined)\n\
15301or repr(object).\n\
15302encoding defaults to sys.getdefaultencoding().\n\
15303errors defaults to 'strict'.");
Tim Peters6d6c1a32001-08-02 04:15:00 +000015304
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015305static PyObject *unicode_iter(PyObject *seq);
15306
Guido van Rossumd57fd912000-03-10 22:53:23 +000015307PyTypeObject PyUnicode_Type = {
Martin v. Löwis9f2e3462007-07-21 17:22:18 +000015308 PyVarObject_HEAD_INIT(&PyType_Type, 0)
Bupfc93bd42018-06-19 03:59:55 -050015309 "str", /* tp_name */
15310 sizeof(PyUnicodeObject), /* tp_basicsize */
15311 0, /* tp_itemsize */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015312 /* Slots */
Bupfc93bd42018-06-19 03:59:55 -050015313 (destructor)unicode_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015314 0, /* tp_vectorcall_offset */
Bupfc93bd42018-06-19 03:59:55 -050015315 0, /* tp_getattr */
15316 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015317 0, /* tp_as_async */
Bupfc93bd42018-06-19 03:59:55 -050015318 unicode_repr, /* tp_repr */
15319 &unicode_as_number, /* tp_as_number */
15320 &unicode_as_sequence, /* tp_as_sequence */
15321 &unicode_as_mapping, /* tp_as_mapping */
15322 (hashfunc) unicode_hash, /* tp_hash*/
15323 0, /* tp_call*/
15324 (reprfunc) unicode_str, /* tp_str */
15325 PyObject_GenericGetAttr, /* tp_getattro */
15326 0, /* tp_setattro */
15327 0, /* tp_as_buffer */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015328 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
Bupfc93bd42018-06-19 03:59:55 -050015329 Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
15330 unicode_doc, /* tp_doc */
15331 0, /* tp_traverse */
15332 0, /* tp_clear */
15333 PyUnicode_RichCompare, /* tp_richcompare */
15334 0, /* tp_weaklistoffset */
15335 unicode_iter, /* tp_iter */
15336 0, /* tp_iternext */
15337 unicode_methods, /* tp_methods */
15338 0, /* tp_members */
15339 0, /* tp_getset */
15340 &PyBaseObject_Type, /* tp_base */
15341 0, /* tp_dict */
15342 0, /* tp_descr_get */
15343 0, /* tp_descr_set */
15344 0, /* tp_dictoffset */
15345 0, /* tp_init */
15346 0, /* tp_alloc */
15347 unicode_new, /* tp_new */
15348 PyObject_Del, /* tp_free */
Guido van Rossumd57fd912000-03-10 22:53:23 +000015349};
15350
15351/* Initialize the Unicode implementation */
15352
Victor Stinner331a6a52019-05-27 16:39:22 +020015353PyStatus
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015354_PyUnicode_Init(void)
Guido van Rossumd57fd912000-03-10 22:53:23 +000015355{
Thomas Wouters477c8d52006-05-27 19:21:47 +000015356 /* XXX - move this array to unicodectype.c ? */
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015357 Py_UCS2 linebreak[] = {
Thomas Wouters477c8d52006-05-27 19:21:47 +000015358 0x000A, /* LINE FEED */
15359 0x000D, /* CARRIAGE RETURN */
15360 0x001C, /* FILE SEPARATOR */
15361 0x001D, /* GROUP SEPARATOR */
15362 0x001E, /* RECORD SEPARATOR */
15363 0x0085, /* NEXT LINE */
15364 0x2028, /* LINE SEPARATOR */
15365 0x2029, /* PARAGRAPH SEPARATOR */
15366 };
15367
Fred Drakee4315f52000-05-09 19:53:39 +000015368 /* Init the implementation */
Serhiy Storchaka678db842013-01-26 12:16:36 +020015369 _Py_INCREF_UNICODE_EMPTY();
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015370 if (!unicode_empty) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015371 return _PyStatus_ERR("Can't create empty string");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015372 }
Serhiy Storchaka678db842013-01-26 12:16:36 +020015373 Py_DECREF(unicode_empty);
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015374
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015375 if (PyType_Ready(&PyUnicode_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015376 return _PyStatus_ERR("Can't initialize unicode type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015377 }
Thomas Wouters477c8d52006-05-27 19:21:47 +000015378
15379 /* initialize the linebreak bloom filter */
15380 bloom_linebreak = make_bloom_mask(
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015381 PyUnicode_2BYTE_KIND, linebreak,
Victor Stinner63941882011-09-29 00:42:28 +020015382 Py_ARRAY_LENGTH(linebreak));
Thomas Wouters0e3f5912006-08-11 14:57:12 +000015383
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015384 if (PyType_Ready(&EncodingMapType) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015385 return _PyStatus_ERR("Can't initialize encoding map type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015386 }
15387 if (PyType_Ready(&PyFieldNameIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015388 return _PyStatus_ERR("Can't initialize field name iterator type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015389 }
15390 if (PyType_Ready(&PyFormatterIter_Type) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015391 return _PyStatus_ERR("Can't initialize formatter iter type");
Victor Stinnerbf4ac2d2019-01-22 17:39:03 +010015392 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015393 return _PyStatus_OK();
Guido van Rossumd57fd912000-03-10 22:53:23 +000015394}
15395
Martin v. Löwis9a3a9f72003-05-18 12:31:09 +000015396
Walter Dörwald16807132007-05-25 13:52:07 +000015397void
15398PyUnicode_InternInPlace(PyObject **p)
15399{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015400 PyObject *s = *p;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015401 PyObject *t;
Victor Stinner4fae54c2011-10-03 02:01:52 +020015402#ifdef Py_DEBUG
15403 assert(s != NULL);
15404 assert(_PyUnicode_CHECK(s));
15405#else
Benjamin Peterson14339b62009-01-31 16:36:08 +000015406 if (s == NULL || !PyUnicode_Check(s))
Victor Stinner4fae54c2011-10-03 02:01:52 +020015407 return;
15408#endif
Benjamin Peterson14339b62009-01-31 16:36:08 +000015409 /* If it's a subclass, we don't really know what putting
15410 it in the interned dict might do. */
15411 if (!PyUnicode_CheckExact(s))
15412 return;
15413 if (PyUnicode_CHECK_INTERNED(s))
15414 return;
15415 if (interned == NULL) {
15416 interned = PyDict_New();
15417 if (interned == NULL) {
15418 PyErr_Clear(); /* Don't leave an exception */
15419 return;
15420 }
15421 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015422 Py_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015423 t = PyDict_SetDefault(interned, s, s);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015424 Py_END_ALLOW_RECURSION
Berker Peksagced8d4c2016-07-25 04:40:39 +030015425 if (t == NULL) {
15426 PyErr_Clear();
15427 return;
15428 }
15429 if (t != s) {
Victor Stinnerf0335102013-04-14 19:13:03 +020015430 Py_INCREF(t);
Serhiy Storchaka57a01d32016-04-10 18:05:40 +030015431 Py_SETREF(*p, t);
Victor Stinnerf0335102013-04-14 19:13:03 +020015432 return;
15433 }
Benjamin Peterson14339b62009-01-31 16:36:08 +000015434 /* The two references in interned are not counted by refcnt.
15435 The deallocator will take care of this */
Victor Stinnerc86a1122020-02-07 01:24:29 +010015436 Py_SET_REFCNT(s, Py_REFCNT(s) - 2);
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015437 _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
Walter Dörwald16807132007-05-25 13:52:07 +000015438}
15439
15440void
15441PyUnicode_InternImmortal(PyObject **p)
15442{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015443 PyUnicode_InternInPlace(p);
15444 if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
Victor Stinneraf9e4b82011-10-23 20:07:00 +020015445 _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015446 Py_INCREF(*p);
15447 }
Walter Dörwald16807132007-05-25 13:52:07 +000015448}
15449
15450PyObject *
15451PyUnicode_InternFromString(const char *cp)
15452{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015453 PyObject *s = PyUnicode_FromString(cp);
15454 if (s == NULL)
15455 return NULL;
15456 PyUnicode_InternInPlace(&s);
15457 return s;
Walter Dörwald16807132007-05-25 13:52:07 +000015458}
15459
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015460
#if defined(WITH_VALGRIND) || defined(__INSURE__)
/* Give every interned string its uncounted references back, mark it as
 * not interned, then clear and release the interned dict.
 *
 * Only compiled for leak detectors (Valgrind / Insure++).  Does nothing
 * when the dict was never created.
 */
static void
unicode_release_interned(void)
{
    if (interned == NULL || !PyDict_Check(interned)) {
        return;
    }
    PyObject *keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        /* Do not leak a non-list result. */
        Py_XDECREF(keys);
        PyErr_Clear();
        return;
    }

    /* Since unicode_release_interned() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    Py_ssize_t n = PyList_GET_SIZE(keys);
#ifdef INTERNED_STATS
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);

    Py_ssize_t immortal_size = 0, mortal_size = 0;
#endif
    for (Py_ssize_t i = 0; i < n; i++) {
        PyObject *s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            Py_UNREACHABLE();
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_INTERNED_IMMORTAL:
            /* Use Py_SET_REFCNT() rather than assigning through
               Py_REFCNT(), matching PyUnicode_InternInPlace(). */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 1);
#ifdef INTERNED_STATS
            immortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Restore the two references that interning removed. */
            Py_SET_REFCNT(s, Py_REFCNT(s) + 2);
#ifdef INTERNED_STATS
            mortal_size += PyUnicode_GET_LENGTH(s);
#endif
            break;
        case SSTATE_NOT_INTERNED:
            /* fall through */
        default:
            Py_UNREACHABLE();
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
#ifdef INTERNED_STATS
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
#endif
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
#endif
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015521
15522
15523/********************* Unicode Iterator **************************/
15524
15525typedef struct {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015526 PyObject_HEAD
15527 Py_ssize_t it_index;
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015528 PyObject *it_seq; /* Set to NULL when iterator is exhausted */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015529} unicodeiterobject;
15530
15531static void
15532unicodeiter_dealloc(unicodeiterobject *it)
15533{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015534 _PyObject_GC_UNTRACK(it);
15535 Py_XDECREF(it->it_seq);
15536 PyObject_GC_Del(it);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015537}
15538
15539static int
15540unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15541{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015542 Py_VISIT(it->it_seq);
15543 return 0;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015544}
15545
15546static PyObject *
15547unicodeiter_next(unicodeiterobject *it)
15548{
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015549 PyObject *seq, *item;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015550
Benjamin Peterson14339b62009-01-31 16:36:08 +000015551 assert(it != NULL);
15552 seq = it->it_seq;
15553 if (seq == NULL)
15554 return NULL;
Victor Stinner910337b2011-10-03 03:20:16 +020015555 assert(_PyUnicode_CHECK(seq));
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015556
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015557 if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15558 int kind = PyUnicode_KIND(seq);
15559 void *data = PyUnicode_DATA(seq);
15560 Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15561 item = PyUnicode_FromOrdinal(chr);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015562 if (item != NULL)
15563 ++it->it_index;
15564 return item;
15565 }
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015566
Benjamin Peterson14339b62009-01-31 16:36:08 +000015567 it->it_seq = NULL;
Serhiy Storchakafbb1c5e2016-03-30 20:40:02 +030015568 Py_DECREF(seq);
Benjamin Peterson14339b62009-01-31 16:36:08 +000015569 return NULL;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015570}
15571
15572static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015573unicodeiter_len(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015574{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015575 Py_ssize_t len = 0;
15576 if (it->it_seq)
Victor Stinnerc4f281e2011-10-11 22:11:42 +020015577 len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015578 return PyLong_FromSsize_t(len);
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015579}
15580
15581PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15582
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015583static PyObject *
Siddhesh Poyarekar55edd0c2018-04-30 00:29:33 +053015584unicodeiter_reduce(unicodeiterobject *it, PyObject *Py_UNUSED(ignored))
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015585{
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015586 _Py_IDENTIFIER(iter);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015587 if (it->it_seq != NULL) {
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015588 return Py_BuildValue("N(O)n", _PyEval_GetBuiltinId(&PyId_iter),
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015589 it->it_seq, it->it_index);
15590 } else {
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015591 PyObject *u = (PyObject *)_PyUnicode_New(0);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015592 if (u == NULL)
15593 return NULL;
Serhiy Storchakabb86bf42018-12-11 08:28:18 +020015594 return Py_BuildValue("N(N)", _PyEval_GetBuiltinId(&PyId_iter), u);
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015595 }
15596}
15597
15598PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15599
15600static PyObject *
15601unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15602{
15603 Py_ssize_t index = PyLong_AsSsize_t(state);
15604 if (index == -1 && PyErr_Occurred())
15605 return NULL;
Kristján Valur Jónsson25dded02014-03-05 13:47:57 +000015606 if (it->it_seq != NULL) {
15607 if (index < 0)
15608 index = 0;
15609 else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15610 index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15611 it->it_index = index;
15612 }
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015613 Py_RETURN_NONE;
15614}
15615
15616PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15617
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015618static PyMethodDef unicodeiter_methods[] = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015619 {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
Benjamin Peterson29060642009-01-31 22:14:21 +000015620 length_hint_doc},
Kristján Valur Jónsson31668b82012-04-03 10:49:41 +000015621 {"__reduce__", (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15622 reduce_doc},
15623 {"__setstate__", (PyCFunction)unicodeiter_setstate, METH_O,
15624 setstate_doc},
Benjamin Peterson14339b62009-01-31 16:36:08 +000015625 {NULL, NULL} /* sentinel */
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015626};
15627
15628PyTypeObject PyUnicodeIter_Type = {
Benjamin Peterson14339b62009-01-31 16:36:08 +000015629 PyVarObject_HEAD_INIT(&PyType_Type, 0)
15630 "str_iterator", /* tp_name */
15631 sizeof(unicodeiterobject), /* tp_basicsize */
15632 0, /* tp_itemsize */
15633 /* methods */
15634 (destructor)unicodeiter_dealloc, /* tp_dealloc */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015635 0, /* tp_vectorcall_offset */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015636 0, /* tp_getattr */
15637 0, /* tp_setattr */
Jeroen Demeyer530f5062019-05-31 04:13:39 +020015638 0, /* tp_as_async */
Benjamin Peterson14339b62009-01-31 16:36:08 +000015639 0, /* tp_repr */
15640 0, /* tp_as_number */
15641 0, /* tp_as_sequence */
15642 0, /* tp_as_mapping */
15643 0, /* tp_hash */
15644 0, /* tp_call */
15645 0, /* tp_str */
15646 PyObject_GenericGetAttr, /* tp_getattro */
15647 0, /* tp_setattro */
15648 0, /* tp_as_buffer */
15649 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15650 0, /* tp_doc */
15651 (traverseproc)unicodeiter_traverse, /* tp_traverse */
15652 0, /* tp_clear */
15653 0, /* tp_richcompare */
15654 0, /* tp_weaklistoffset */
15655 PyObject_SelfIter, /* tp_iter */
15656 (iternextfunc)unicodeiter_next, /* tp_iternext */
15657 unicodeiter_methods, /* tp_methods */
15658 0,
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015659};
15660
15661static PyObject *
15662unicode_iter(PyObject *seq)
15663{
Benjamin Peterson14339b62009-01-31 16:36:08 +000015664 unicodeiterobject *it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015665
Benjamin Peterson14339b62009-01-31 16:36:08 +000015666 if (!PyUnicode_Check(seq)) {
15667 PyErr_BadInternalCall();
15668 return NULL;
15669 }
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015670 if (PyUnicode_READY(seq) == -1)
15671 return NULL;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015672 it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15673 if (it == NULL)
15674 return NULL;
15675 it->it_index = 0;
15676 Py_INCREF(seq);
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015677 it->it_seq = seq;
Benjamin Peterson14339b62009-01-31 16:36:08 +000015678 _PyObject_GC_TRACK(it);
15679 return (PyObject *)it;
Guido van Rossum50e9fb92006-08-17 05:42:55 +000015680}
15681
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015682
15683size_t
15684Py_UNICODE_strlen(const Py_UNICODE *u)
15685{
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015686 return wcslen(u);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015687}
15688
15689Py_UNICODE*
15690Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15691{
15692 Py_UNICODE *u = s1;
15693 while ((*u++ = *s2++));
15694 return s1;
15695}
15696
15697Py_UNICODE*
15698Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15699{
15700 Py_UNICODE *u = s1;
15701 while ((*u++ = *s2++))
15702 if (n-- == 0)
15703 break;
15704 return s1;
15705}
15706
15707Py_UNICODE*
15708Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15709{
15710 Py_UNICODE *u1 = s1;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015711 u1 += wcslen(u1);
15712 while ((*u1++ = *s2++));
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015713 return s1;
15714}
15715
15716int
15717Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15718{
15719 while (*s1 && *s2 && *s1 == *s2)
15720 s1++, s2++;
15721 if (*s1 && *s2)
15722 return (*s1 < *s2) ? -1 : +1;
15723 if (*s1)
15724 return 1;
15725 if (*s2)
15726 return -1;
15727 return 0;
15728}
15729
15730int
15731Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15732{
Antoine Pitrou9ed5f272013-08-13 20:18:52 +020015733 Py_UNICODE u1, u2;
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015734 for (; n != 0; n--) {
15735 u1 = *s1;
15736 u2 = *s2;
15737 if (u1 != u2)
15738 return (u1 < u2) ? -1 : +1;
15739 if (u1 == '\0')
15740 return 0;
15741 s1++;
15742 s2++;
15743 }
15744 return 0;
15745}
15746
15747Py_UNICODE*
15748Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15749{
15750 const Py_UNICODE *p;
15751 for (p = s; *p; p++)
15752 if (*p == c)
15753 return (Py_UNICODE*)p;
15754 return NULL;
15755}
15756
15757Py_UNICODE*
15758Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15759{
15760 const Py_UNICODE *p;
Serhiy Storchaka460bd0d2016-11-20 12:16:46 +020015761 p = s + wcslen(s);
Martin v. Löwis0d3072e2011-10-31 08:40:56 +010015762 while (p != s) {
15763 p--;
15764 if (*p == c)
15765 return (Py_UNICODE*)p;
15766 }
15767 return NULL;
15768}
Victor Stinner331ea922010-08-10 16:37:20 +000015769
Victor Stinner71133ff2010-09-01 23:43:53 +000015770Py_UNICODE*
Victor Stinner9db1a8b2011-10-23 20:04:37 +020015771PyUnicode_AsUnicodeCopy(PyObject *unicode)
Victor Stinner71133ff2010-09-01 23:43:53 +000015772{
Victor Stinner577db2c2011-10-11 22:12:48 +020015773 Py_UNICODE *u, *copy;
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015774 Py_ssize_t len, size;
Victor Stinner71133ff2010-09-01 23:43:53 +000015775
Martin v. Löwisd63a3b82011-09-28 07:41:54 +020015776 if (!PyUnicode_Check(unicode)) {
15777 PyErr_BadArgument();
15778 return NULL;
15779 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015780 u = PyUnicode_AsUnicodeAndSize(unicode, &len);
Victor Stinner577db2c2011-10-11 22:12:48 +020015781 if (u == NULL)
15782 return NULL;
Victor Stinner71133ff2010-09-01 23:43:53 +000015783 /* Ensure we won't overflow the size. */
Gregory P. Smith8486f9b2014-09-30 00:33:24 -070015784 if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
Victor Stinner71133ff2010-09-01 23:43:53 +000015785 PyErr_NoMemory();
15786 return NULL;
15787 }
Victor Stinner57ffa9d2011-10-23 20:10:08 +020015788 size = len + 1; /* copy the null character */
Victor Stinner71133ff2010-09-01 23:43:53 +000015789 size *= sizeof(Py_UNICODE);
15790 copy = PyMem_Malloc(size);
15791 if (copy == NULL) {
15792 PyErr_NoMemory();
15793 return NULL;
15794 }
Victor Stinner577db2c2011-10-11 22:12:48 +020015795 memcpy(copy, u, size);
Victor Stinner71133ff2010-09-01 23:43:53 +000015796 return copy;
15797}
Martin v. Löwis5b222132007-06-10 09:51:05 +000015798
Victor Stinnerfecc4f22019-03-19 14:20:29 +010015799
Victor Stinner709d23d2019-05-02 14:56:30 -040015800static int
15801encode_wstr_utf8(wchar_t *wstr, char **str, const char *name)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015802{
Victor Stinner709d23d2019-05-02 14:56:30 -040015803 int res;
15804 res = _Py_EncodeUTF8Ex(wstr, str, NULL, NULL, 1, _Py_ERROR_STRICT);
15805 if (res == -2) {
15806 PyErr_Format(PyExc_RuntimeWarning, "cannot decode %s", name);
15807 return -1;
15808 }
15809 if (res < 0) {
15810 PyErr_NoMemory();
15811 return -1;
15812 }
15813 return 0;
15814}
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015815
Victor Stinner709d23d2019-05-02 14:56:30 -040015816
15817static int
15818config_get_codec_name(wchar_t **config_encoding)
15819{
15820 char *encoding;
15821 if (encode_wstr_utf8(*config_encoding, &encoding, "stdio_encoding") < 0) {
15822 return -1;
15823 }
15824
15825 PyObject *name_obj = NULL;
15826 PyObject *codec = _PyCodec_Lookup(encoding);
15827 PyMem_RawFree(encoding);
15828
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015829 if (!codec)
15830 goto error;
15831
15832 name_obj = PyObject_GetAttrString(codec, "name");
15833 Py_CLEAR(codec);
15834 if (!name_obj) {
15835 goto error;
15836 }
15837
Victor Stinner709d23d2019-05-02 14:56:30 -040015838 wchar_t *wname = PyUnicode_AsWideCharString(name_obj, NULL);
15839 Py_DECREF(name_obj);
15840 if (wname == NULL) {
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015841 goto error;
15842 }
15843
Victor Stinner709d23d2019-05-02 14:56:30 -040015844 wchar_t *raw_wname = _PyMem_RawWcsdup(wname);
15845 if (raw_wname == NULL) {
15846 PyMem_Free(wname);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015847 PyErr_NoMemory();
Victor Stinner709d23d2019-05-02 14:56:30 -040015848 goto error;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015849 }
Victor Stinner709d23d2019-05-02 14:56:30 -040015850
15851 PyMem_RawFree(*config_encoding);
15852 *config_encoding = raw_wname;
15853
15854 PyMem_Free(wname);
15855 return 0;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015856
15857error:
15858 Py_XDECREF(codec);
15859 Py_XDECREF(name_obj);
Victor Stinner709d23d2019-05-02 14:56:30 -040015860 return -1;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015861}
15862
15863
Victor Stinner331a6a52019-05-27 16:39:22 +020015864static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015865init_stdio_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015866{
Victor Stinner709d23d2019-05-02 14:56:30 -040015867 /* Update the stdio encoding to the normalized Python codec name. */
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015868 PyConfig *config = &tstate->interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015869 if (config_get_codec_name(&config->stdio_encoding) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015870 return _PyStatus_ERR("failed to get the Python codec name "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015871 "of the stdio encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015872 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015873 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015874}
15875
15876
Victor Stinner709d23d2019-05-02 14:56:30 -040015877static int
15878init_fs_codec(PyInterpreterState *interp)
15879{
Victor Stinner331a6a52019-05-27 16:39:22 +020015880 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015881
15882 _Py_error_handler error_handler;
15883 error_handler = get_error_handler_wide(config->filesystem_errors);
15884 if (error_handler == _Py_ERROR_UNKNOWN) {
15885 PyErr_SetString(PyExc_RuntimeError, "unknow filesystem error handler");
15886 return -1;
15887 }
15888
15889 char *encoding, *errors;
15890 if (encode_wstr_utf8(config->filesystem_encoding,
15891 &encoding,
15892 "filesystem_encoding") < 0) {
15893 return -1;
15894 }
15895
15896 if (encode_wstr_utf8(config->filesystem_errors,
15897 &errors,
15898 "filesystem_errors") < 0) {
15899 PyMem_RawFree(encoding);
15900 return -1;
15901 }
15902
15903 PyMem_RawFree(interp->fs_codec.encoding);
15904 interp->fs_codec.encoding = encoding;
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015905 /* encoding has been normalized by init_fs_encoding() */
15906 interp->fs_codec.utf8 = (strcmp(encoding, "utf-8") == 0);
Victor Stinner709d23d2019-05-02 14:56:30 -040015907 PyMem_RawFree(interp->fs_codec.errors);
15908 interp->fs_codec.errors = errors;
15909 interp->fs_codec.error_handler = error_handler;
15910
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015911#ifdef _Py_FORCE_UTF8_FS_ENCODING
15912 assert(interp->fs_codec.utf8 == 1);
15913#endif
15914
Victor Stinner709d23d2019-05-02 14:56:30 -040015915 /* At this point, PyUnicode_EncodeFSDefault() and
15916 PyUnicode_DecodeFSDefault() can now use the Python codec rather than
15917 the C implementation of the filesystem encoding. */
15918
15919 /* Set Py_FileSystemDefaultEncoding and Py_FileSystemDefaultEncodeErrors
15920 global configuration variables. */
15921 if (_Py_SetFileSystemEncoding(interp->fs_codec.encoding,
15922 interp->fs_codec.errors) < 0) {
15923 PyErr_NoMemory();
15924 return -1;
15925 }
15926 return 0;
15927}
15928
15929
Victor Stinner331a6a52019-05-27 16:39:22 +020015930static PyStatus
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015931init_fs_encoding(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015932{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015933 PyInterpreterState *interp = tstate->interp;
15934
Victor Stinner709d23d2019-05-02 14:56:30 -040015935 /* Update the filesystem encoding to the normalized Python codec name.
15936 For example, replace "ANSI_X3.4-1968" (locale encoding) with "ascii"
15937 (Python codec name). */
Victor Stinner331a6a52019-05-27 16:39:22 +020015938 PyConfig *config = &interp->config;
Victor Stinner709d23d2019-05-02 14:56:30 -040015939 if (config_get_codec_name(&config->filesystem_encoding) < 0) {
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015940 _Py_DumpPathConfig(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015941 return _PyStatus_ERR("failed to get the Python codec "
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015942 "of the filesystem encoding");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015943 }
15944
Victor Stinner709d23d2019-05-02 14:56:30 -040015945 if (init_fs_codec(interp) < 0) {
Victor Stinner331a6a52019-05-27 16:39:22 +020015946 return _PyStatus_ERR("cannot initialize filesystem codec");
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015947 }
Victor Stinner331a6a52019-05-27 16:39:22 +020015948 return _PyStatus_OK();
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015949}
15950
15951
Victor Stinner331a6a52019-05-27 16:39:22 +020015952PyStatus
Victor Stinnerb45d2592019-06-20 00:05:23 +020015953_PyUnicode_InitEncodings(PyThreadState *tstate)
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015954{
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015955 PyStatus status = init_fs_encoding(tstate);
Victor Stinner331a6a52019-05-27 16:39:22 +020015956 if (_PyStatus_EXCEPTION(status)) {
15957 return status;
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015958 }
15959
Victor Stinnerfcdb0272019-09-23 14:45:47 +020015960 return init_stdio_encoding(tstate);
Victor Stinner43fc3bb2019-05-02 11:54:20 -040015961}
15962
15963
Victor Stinnerbf305cc2020-02-05 17:39:57 +010015964static void
15965_PyUnicode_FiniEncodings(PyThreadState *tstate)
15966{
15967 PyInterpreterState *interp = tstate->interp;
15968 PyMem_RawFree(interp->fs_codec.encoding);
15969 interp->fs_codec.encoding = NULL;
15970 interp->fs_codec.utf8 = 0;
15971 PyMem_RawFree(interp->fs_codec.errors);
15972 interp->fs_codec.errors = NULL;
15973 interp->fs_codec.error_handler = _Py_ERROR_UNKNOWN;
15974}
15975
15976
#ifdef MS_WINDOWS
/* Switch the filesystem encoding to "mbcs" with the "replace" error handler
   (PEP 529) and re-initialize the filesystem codec accordingly.

   Return the result of init_fs_codec() on success of the allocations.
   If either string cannot be allocated, the configuration is left untouched,
   MemoryError is set and -1 is returned. */
int
_PyUnicode_EnableLegacyWindowsFSEncoding(void)
{
    PyInterpreterState *interp = _PyInterpreterState_GET_UNSAFE();
    PyConfig *config = &interp->config;

    /* Allocate both replacement strings before modifying the config. */
    wchar_t *new_encoding = _PyMem_RawWcsdup(L"mbcs");
    wchar_t *new_errors = _PyMem_RawWcsdup(L"replace");
    if (!(new_encoding != NULL && new_errors != NULL)) {
        PyMem_RawFree(new_encoding);
        PyMem_RawFree(new_errors);
        PyErr_NoMemory();
        return -1;
    }

    PyMem_RawFree(config->filesystem_encoding);
    config->filesystem_encoding = new_encoding;

    PyMem_RawFree(config->filesystem_errors);
    config->filesystem_errors = new_errors;

    return init_fs_codec(interp);
}
#endif
16002
16003
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016004void
Victor Stinner3d483342019-11-22 12:27:50 +010016005_PyUnicode_Fini(PyThreadState *tstate)
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016006{
Victor Stinner3d483342019-11-22 12:27:50 +010016007 if (_Py_IsMainInterpreter(tstate)) {
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016008#if defined(WITH_VALGRIND) || defined(__INSURE__)
Victor Stinner3d483342019-11-22 12:27:50 +010016009 /* Insure++ is a memory analysis tool that aids in discovering
16010 * memory leaks and other memory problems. On Python exit, the
16011 * interned string dictionaries are flagged as being in use at exit
16012 * (which it is). Under normal circumstances, this is fine because
16013 * the memory will be automatically reclaimed by the system. Under
16014 * memory debugging, it's a huge source of useless noise, so we
16015 * trade off slower shutdown for less distraction in the memory
16016 * reports. -baw
16017 */
16018 unicode_release_interned();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016019#endif /* __INSURE__ */
16020
Victor Stinner3d483342019-11-22 12:27:50 +010016021 Py_CLEAR(unicode_empty);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016022
Victor Stinner3d483342019-11-22 12:27:50 +010016023 for (Py_ssize_t i = 0; i < 256; i++) {
16024 Py_CLEAR(unicode_latin1[i]);
16025 }
16026 _PyUnicode_ClearStaticStrings();
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016027 }
Victor Stinner709d23d2019-05-02 14:56:30 -040016028
Victor Stinnerbf305cc2020-02-05 17:39:57 +010016029 _PyUnicode_FiniEncodings(tstate);
Victor Stinnerfecc4f22019-03-19 14:20:29 +010016030}
16031
16032
Georg Brandl66c221e2010-10-14 07:04:07 +000016033/* A _string module, to export formatter_parser and formatter_field_name_split
16034 to the string.Formatter class implemented in Python. */
16035
16036static PyMethodDef _string_methods[] = {
16037 {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
16038 METH_O, PyDoc_STR("split the argument as a field name")},
16039 {"formatter_parser", (PyCFunction) formatter_parser,
16040 METH_O, PyDoc_STR("parse the argument as a format string")},
16041 {NULL, NULL}
16042};
16043
16044static struct PyModuleDef _string_module = {
16045 PyModuleDef_HEAD_INIT,
16046 "_string",
16047 PyDoc_STR("string helper module"),
16048 0,
16049 _string_methods,
16050 NULL,
16051 NULL,
16052 NULL,
16053 NULL
16054};
16055
16056PyMODINIT_FUNC
16057PyInit__string(void)
16058{
16059 return PyModule_Create(&_string_module);
16060}
16061
16062
Thomas Wouters49fd7fa2006-04-21 10:40:58 +000016063#ifdef __cplusplus
16064}
16065#endif